diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 254b75b784e75..d1e32307cb78a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -86,6 +86,8 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); VGPRExcessLimit = Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); + AGPRExcessLimit = + Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::AGPR_32RegClass); SIMachineFunctionInfo &MFI = *MF->getInfo(); // Set the initial TargetOccupnacy to the maximum occupancy that we can @@ -98,6 +100,9 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { SGPRCriticalLimit = std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); + AGPRCriticalLimit = + std::min(ST.getMaxNumAGPRs(TargetOccupancy), AGPRExcessLimit); + if (!KnownExcessRP) { VGPRCriticalLimit = std::min( ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()), @@ -201,7 +206,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, - unsigned VGPRPressure, bool IsBottomUp) { + unsigned VGPRPressure, + unsigned AGPRPressure, bool IsBottomUp) { Cand.SU = SU; Cand.AtTop = AtTop; @@ -230,6 +236,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, Pressure.resize(4, 0); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = AGPRPressure; for (const auto &Diff : DAG->getPressureDiff(SU)) { if (!Diff.isValid()) @@ -247,7 +254,9 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || 
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != - CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { + CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] || + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] != + CheckPressure[AMDGPU::RegisterPressureSets::AGPR_32]) { errs() << "Register Pressure is inaccurate when calculated through " "PressureDiff\n" << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] @@ -255,7 +264,10 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] << ", expected " - << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n"; + << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n" + << "AGPR got " << Pressure[AMDGPU::RegisterPressureSets::AGPR_32] + << ", expected " + << CheckPressure[AMDGPU::RegisterPressureSets::AGPR_32] << "\n"; report_fatal_error("inaccurate register pressure calculation"); } #endif @@ -263,6 +275,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + unsigned NewAGPRPressure = Pressure[AMDGPU::RegisterPressureSets::AGPR_32]; // If two instructions increase the pressure of different register sets // by the same amount, the generic scheduler will prefer to schedule the @@ -272,9 +285,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, // only for VGPRs or only for SGPRs. // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. 
- const unsigned MaxVGPRPressureInc = 16; + static constexpr unsigned MaxVGPRPressureInc = 16; bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; - bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; + bool ShouldTrackAGPRs = !ShouldTrackVGPRs && AGPRPressure >= AGPRExcessLimit; + bool ShouldTrackSGPRs = + !ShouldTrackVGPRs && !ShouldTrackAGPRs && SGPRPressure >= SGPRExcessLimit; // FIXME: We have to enter REG-EXCESS before we reach the actual threshold // to increase the likelihood we don't go over the limits. We should improve @@ -291,6 +306,12 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); } + if (ShouldTrackAGPRs && NewAGPRPressure >= AGPRExcessLimit) { + HasHighPressure = true; + Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::AGPR_32); + Cand.RPDelta.Excess.setUnitInc(NewAGPRPressure - AGPRExcessLimit); + } + if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { HasHighPressure = true; Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); @@ -304,13 +325,19 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; + int AGPRDelta = NewAGPRPressure - AGPRCriticalLimit; - if (SGPRDelta >= 0 || VGPRDelta >= 0) { + if (SGPRDelta >= 0 || VGPRDelta >= 0 || AGPRDelta >= 0) { HasHighPressure = true; - if (SGPRDelta > VGPRDelta) { + // Track the register set whose critical-limit overrun is largest; on ties + // we fall through to VGPR below. + if (SGPRDelta > VGPRDelta && SGPRDelta > AGPRDelta) { Cand.RPDelta.CriticalMax = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); + } else if (AGPRDelta > VGPRDelta) { + Cand.RPDelta.CriticalMax = + PressureChange(AMDGPU::RegisterPressureSets::AGPR_32); + Cand.RPDelta.CriticalMax.setUnitInc(AGPRDelta); } else {
Cand.RPDelta.CriticalMax = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); @@ -330,16 +357,19 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, ArrayRef Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; + unsigned AGPRPressure = 0; if (DAG->isTrackingPressure()) { if (!GCNTrackers) { SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + AGPRPressure = Pressure[AMDGPU::RegisterPressureSets::AGPR_32]; } else { GCNRPTracker *T = IsBottomUp ? static_cast(&UpwardTracker) : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); VGPRPressure = T->getPressure().getArchVGPRNum(); + AGPRPressure = T->getPressure().getAGPRNum(); } } ReadyQueue &Q = Zone.Available; @@ -347,7 +377,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, SchedCandidate TryCand(ZonePolicy); initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, - VGPRPressure, IsBottomUp); + VGPRPressure, AGPRPressure, IsBottomUp); // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? 
&Zone : nullptr; tryCandidate(Cand, TryCand, ZoneArg); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 790370ff8ab4d..8b2137bcd14da 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -53,7 +53,8 @@ class GCNSchedStrategy : public GenericScheduler { void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, - unsigned VGPRPressure, bool IsBottomUp); + unsigned VGPRPressure, unsigned AGPRPressure, + bool IsBottomUp); std::vector Pressure; @@ -63,6 +64,8 @@ class GCNSchedStrategy : public GenericScheduler { unsigned VGPRExcessLimit; + unsigned AGPRExcessLimit; + unsigned TargetOccupancy; MachineFunction *MF; @@ -103,6 +106,8 @@ class GCNSchedStrategy : public GenericScheduler { unsigned VGPRCriticalLimit; + unsigned AGPRCriticalLimit; + unsigned SGPRLimitBias = 0; unsigned VGPRLimitBias = 0; @@ -183,8 +188,7 @@ class ScheduleMetrics { }; inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) { - dbgs() << "\n Schedule Metric (scaled by " - << ScheduleMetrics::ScaleFactor + dbgs() << "\n Schedule Metric (scaled by " << ScheduleMetrics::ScaleFactor << " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/" << Sm.getLength() << " ]\n"; return OS; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 4475c8d1d1602..c9fa3894408e9 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1722,8 +1722,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// unit requirement. 
unsigned getMaxNumVGPRs(const Function &F) const; - unsigned getMaxNumAGPRs(const Function &F) const { - return getMaxNumVGPRs(F); + unsigned getMaxNumAGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMaxNumAGPRs(this, WavesPerEU); } /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number @@ -1744,13 +1744,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool supportsWave64() const { return !hasGFX1250Insts(); } - bool isWave32() const { - return getWavefrontSize() == 32; - } + bool isWave32() const { return getWavefrontSize() == 32; } - bool isWave64() const { - return getWavefrontSize() == 64; - } + bool isWave64() const { return getWavefrontSize() == 64; } /// Returns if the wavesize of this subtarget is known reliable. This is false /// only for the a default target-cpu that does not have an explicit diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 0a0b02c18c1db..d78106694f2e8 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1494,6 +1494,22 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, return std::min(MaxNumVGPRs, AddressableNumVGPRs); } +unsigned getMaxNumAGPRs(const MCSubtargetInfo *STI, unsigned int WavesPerEU) { + if (!STI->getFeatureBits().test(FeatureMAIInsts)) + return 0; + + assert(WavesPerEU != 0); + + assert(!STI->getFeatureBits().test(FeatureDynamicVGPR)); + + unsigned MaxNumAGPRs = + alignTo(getTotalNumVGPRs(STI) / WavesPerEU, getVGPRAllocGranule(STI, 0)); + unsigned AddressableNumAGPRs = getAddressableNumArchVGPRs(STI); + return std::min(MaxNumAGPRs, AddressableNumAGPRs); +} + +unsigned getAddressableNumAGPRs(const MCSubtargetInfo *STI) { return 256; } + unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, std::optional EnableWavefrontSize32) { return getGranulatedNumRegisterBlocks( diff --git 
a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 23ea3ba0c8385..ecf7faac89ce5 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -353,6 +353,13 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule, unsigned MaxWaves, unsigned TotalNumVGPRs); +/// \returns Maximum number of AGPRs that meets given number of waves per +/// execution unit requirement for given subtarget \p STI. +unsigned getMaxNumAGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); + +/// \returns Addressable number of AGPRs for a given subtarget \p STI. +unsigned getAddressableNumAGPRs(const MCSubtargetInfo *STI); + /// \returns Occupancy for a given \p SGPRs usage, \p MaxWaves possible, and \p /// Gen. unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll index b67080bd4798d..7c58791281562 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -149,55 +149,55 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-LABEL: add_v5i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; GFX8-NEXT: v_add_u32_e64 v19, s[16:17], 2, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], 6, v0 +; GFX8-NEXT: v_add_u32_e64 v10, s[6:7], 8, v0 +; GFX8-NEXT: v_add_u32_e64 v12, s[8:9], 2, v2 +; GFX8-NEXT: v_add_u32_e64 v14, s[10:11], 4, v2 +; GFX8-NEXT: v_add_u32_e64 v18, s[14:15], 8, v2 +; GFX8-NEXT: v_addc_u32_e64 v20, s[16:17], 0, v1, s[16:17] +; GFX8-NEXT: v_add_u32_e64 v16, s[12:13], 6, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: 
v_add_u32_e32 v10, vcc, 6, v0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v12, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v13, v[6:7] -; GFX8-NEXT: flat_load_ushort v14, v[8:9] -; GFX8-NEXT: flat_load_ushort v15, v[10:11] -; GFX8-NEXT: flat_load_ushort v16, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v17, v[2:3] -; GFX8-NEXT: flat_load_ushort v18, v[0:1] -; GFX8-NEXT: flat_load_ushort v19, v[6:7] -; GFX8-NEXT: flat_load_ushort v20, v[8:9] +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v1, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v1, s[6:7] +; GFX8-NEXT: flat_load_ushort v21, v[0:1] +; GFX8-NEXT: flat_load_ushort v20, v[19:20] +; GFX8-NEXT: flat_load_ushort v22, v[6:7] +; GFX8-NEXT: v_addc_u32_e64 v13, s[6:7], 0, v3, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v15, s[6:7], 0, v3, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v3, s[14:15] +; GFX8-NEXT: flat_load_ushort v23, v[8:9] ; GFX8-NEXT: flat_load_ushort v10, v[10:11] +; GFX8-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v3, s[12:13] +; GFX8-NEXT: flat_load_ushort v11, v[2:3] +; GFX8-NEXT: flat_load_ushort v12, v[12:13] +; GFX8-NEXT: flat_load_ushort v13, v[14:15] +; GFX8-NEXT: flat_load_ushort v14, v[16:17] +; GFX8-NEXT: flat_load_ushort v15, v[18:19] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 +; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], 4, v4 +; GFX8-NEXT: v_add_u32_e64 v8, s[6:7], 6, v4 +; GFX8-NEXT: v_add_u32_e64 v2, s[8:9], 8, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 -; GFX8-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 6, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v4 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GFX8-NEXT: v_addc_u32_e64 v7, vcc, 0, v5, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v5, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v3, vcc, 0, v5, s[8:9] ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u16_e32 v11, v12, v17 +; GFX8-NEXT: v_add_u16_e32 v11, v21, v11 ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u16_e32 v12, v13, v18 +; GFX8-NEXT: v_add_u16_e32 v12, v20, v12 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v13, v14, v19 +; GFX8-NEXT: v_add_u16_e32 v13, v22, v13 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v14, v15, v20 +; GFX8-NEXT: v_add_u16_e32 v14, v23, v14 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v10, v16, v10 +; GFX8-NEXT: v_add_u16_e32 v10, v10, v15 ; GFX8-NEXT: flat_store_short v[4:5], v11 ; GFX8-NEXT: flat_store_short v[0:1], v12 -; GFX8-NEXT: flat_store_short v[2:3], v13 -; GFX8-NEXT: flat_store_short v[6:7], v14 -; GFX8-NEXT: flat_store_short v[8:9], v10 +; GFX8-NEXT: flat_store_short v[6:7], v13 +; GFX8-NEXT: flat_store_short v[8:9], v14 +; GFX8-NEXT: flat_store_short v[2:3], v10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -341,77 +341,77 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-LABEL: addv_7i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], 6, v0 +; GFX8-NEXT: v_add_u32_e64 v10, s[6:7], 8, v0 +; GFX8-NEXT: v_add_u32_e64 v12, s[8:9], 10, v0 +; GFX8-NEXT: v_add_u32_e64 v14, s[10:11], 12, v0 +; GFX8-NEXT: v_add_u32_e64 v19, s[16:17], 2, v0 +; GFX8-NEXT: v_add_u32_e64 v16, s[12:13], 2, v2 +; GFX8-NEXT: v_add_u32_e64 v18, s[14:15], 4, v2 +; 
GFX8-NEXT: v_addc_u32_e64 v20, s[16:17], 0, v1, s[16:17] ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 8, v0 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 10, v0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v16, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v17, v[6:7] -; GFX8-NEXT: flat_load_ushort v18, v[8:9] -; GFX8-NEXT: flat_load_ushort v19, v[10:11] -; GFX8-NEXT: flat_load_ushort v20, v[12:13] -; GFX8-NEXT: flat_load_ushort v21, v[14:15] -; GFX8-NEXT: flat_load_ushort v22, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v1, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v1, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v13, vcc, 0, v1, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v15, vcc, 0, v1, s[10:11] +; GFX8-NEXT: flat_load_ushort v21, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2 +; GFX8-NEXT: flat_load_ushort v20, v[19:20] +; GFX8-NEXT: flat_load_ushort v22, v[6:7] +; GFX8-NEXT: v_addc_u32_e64 v17, s[8:9], 0, v3, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v19, s[10:11], 0, v3, s[14:15] +; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], 8, v2 +; GFX8-NEXT: flat_load_ushort v23, v[8:9] +; GFX8-NEXT: flat_load_ushort v24, v[10:11] +; GFX8-NEXT: v_add_u32_e64 v8, s[6:7], 10, v2 +; GFX8-NEXT: flat_load_ushort v12, v[12:13] +; GFX8-NEXT: flat_load_ushort v13, v[14:15] +; GFX8-NEXT: v_add_u32_e64 v10, s[8:9], 12, v2 +; GFX8-NEXT: flat_load_ushort v14, v[16:17] +; GFX8-NEXT: flat_load_ushort v15, v[18:19] ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-NEXT: 
v_add_u32_e32 v8, vcc, 6, v2 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 10, v2 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 12, v2 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[0:1] -; GFX8-NEXT: flat_load_ushort v6, v[6:7] -; GFX8-NEXT: flat_load_ushort v7, v[8:9] -; GFX8-NEXT: flat_load_ushort v8, v[10:11] -; GFX8-NEXT: flat_load_ushort v9, v[12:13] -; GFX8-NEXT: flat_load_ushort v10, v[14:15] +; GFX8-NEXT: flat_load_ushort v16, v[0:1] +; GFX8-NEXT: v_addc_u32_e64 v7, vcc, 0, v3, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v3, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v3, s[8:9] +; GFX8-NEXT: flat_load_ushort v3, v[6:7] +; GFX8-NEXT: flat_load_ushort v6, v[8:9] +; GFX8-NEXT: flat_load_ushort v7, v[10:11] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v2, v16, v2 +; GFX8-NEXT: v_add_u16_e32 v8, v20, v14 ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u16_e32 v3, v17, v3 +; GFX8-NEXT: v_add_u16_e32 v9, v22, v15 +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u16_e32 v2, v21, v2 ; GFX8-NEXT: flat_store_short v[4:5], v2 -; GFX8-NEXT: flat_store_short v[0:1], v3 +; GFX8-NEXT: flat_store_short v[0:1], v8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v6, v18, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v6 +; GFX8-NEXT: flat_store_short v[0:1], v9 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v7, v19, v7 +; GFX8-NEXT: v_add_u16_e32 v10, v23, v16 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v7 +; 
GFX8-NEXT: flat_store_short v[0:1], v10 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v8, v20, v8 +; GFX8-NEXT: v_add_u16_e32 v3, v24, v3 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v8 +; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v9, v21, v9 +; GFX8-NEXT: v_add_u16_e32 v6, v12, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v9 +; GFX8-NEXT: flat_store_short v[0:1], v6 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v4 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v10, v22, v10 +; GFX8-NEXT: v_add_u16_e32 v7, v13, v7 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v10 +; GFX8-NEXT: flat_store_short v[0:1], v7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -511,31 +511,31 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v14, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v1, s[4:5] +; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v1, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v3, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 -; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v6, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX8-NEXT: v_add_u16_e32 v3, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v1, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v7, v10, v7 +; GFX8-NEXT: v_or_b32_e32 v8, v3, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v13, v14, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v10 -; GFX8-NEXT: v_or_b32_e32 v2, v11, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[6:7], v13 +; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v9, v2, v9 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; GFX8-NEXT: flat_store_short v[14:15], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -602,33 +602,33 @@ define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v14, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v15, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v0, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v2, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v6, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v8, v9, v13 -; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v7 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX8-NEXT: v_add_u16_e32 v3, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v6, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX8-NEXT: v_add_u16_e32 v3, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v1, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v9, v3, v1 +; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v6, v14, v15 -; GFX8-NEXT: v_add_u16_sdwa v7, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_add_u16_e32 v1, v0, v2 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; GFX8-NEXT: v_or_b32_e32 v7, v10, v7 +; GFX8-NEXT: v_or_b32_e32 v8, v11, v8 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_dword v[0:1], v6 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -663,53 +663,53 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v14, v[14:15] -; GFX8-NEXT: flat_load_ushort v15, v[16:17] -; GFX8-NEXT: flat_load_ushort v16, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], 18, v2 +; GFX8-NEXT: v_add_u32_e64 v18, s[8:9], 16, v2 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 18, v0 +; GFX8-NEXT: v_add_u32_e64 v17, s[6:7], 20, v2 +; GFX8-NEXT: v_addc_u32_e64 v19, s[8:9], 0, v3, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v3, s[4:5] +; GFX8-NEXT: flat_load_ushort v20, v[18:19] +; GFX8-NEXT: v_addc_u32_e64 v18, s[4:5], 0, v3, s[6:7] +; GFX8-NEXT: 
flat_load_ushort v3, v[15:16] +; GFX8-NEXT: flat_load_ushort v21, v[17:18] +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e64 v16, s[4:5], 16, v0 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 20, v0 +; GFX8-NEXT: v_addc_u32_e64 v17, s[4:5], 0, v1, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u16_e32 v17, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0 -; GFX8-NEXT: v_add_u16_e32 v18, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[6:7] -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v21, v[0:1] +; GFX8-NEXT: v_add_u16_e32 v2, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v6, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v1, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_add_u16_e32 v2, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v6, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: flat_load_ushort v7, v[16:17] +; GFX8-NEXT: flat_load_ushort v8, v[14:15] +; GFX8-NEXT: flat_load_ushort v11, v[18:19] +; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u16_e32 v19, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4 -; GFX8-NEXT: v_add_u16_e32 v20, v9, v13 
-; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v17, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v18, v11 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e64 v10, s[4:5], 18, v4 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v14, v2, v14 +; GFX8-NEXT: v_add_u16_e32 v12, v7, v20 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v15, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v2, v19, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v20, v13 +; GFX8-NEXT: v_add_u16_e32 v8, v8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v16, v21, v16 +; GFX8-NEXT: v_add_u16_e32 v16, v11, v21 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v5, s[4:5] +; GFX8-NEXT: v_add_u16_e32 v3, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 20, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[6:7], v14 -; GFX8-NEXT: flat_store_short v[8:9], v15 -; GFX8-NEXT: flat_store_short v[10:11], v16 +; GFX8-NEXT: flat_store_short v[6:7], v12 +; GFX8-NEXT: flat_store_short v[10:11], v8 +; GFX8-NEXT: flat_store_short v[14:15], v16 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -796,23 +796,23 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; 
GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v2, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v16, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v6, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX8-NEXT: v_add_u16_e32 v16, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v16, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v10, v7 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 ; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 -; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v10, v11 -; GFX8-NEXT: v_or_b32_e32 v2, v16, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_add_u16_sdwa v3, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v11, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_add_u16_e32 v8, v6, v14 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 206011adf0213..ac29374337c24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1855,12 +1855,14 @@ entry: define amdgpu_ps double @dyn_extract_v16f64_v_s(<16 x double> %vec, 
i32 inreg %sel) { ; GPRIDX-LABEL: dyn_extract_v16f64_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GPRIDX-NEXT: s_lshl_b32 s1, s2, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(SRC0) ; GPRIDX-NEXT: v_mov_b32_e32 v32, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v32 +; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 +; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v0 ; GPRIDX-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll index ea149cc2f4a9e..b29d9e1f86263 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -712,90 +712,87 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: 
v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[4:5], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], 
v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] -; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] +; 
GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[10:11] +; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: 
v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] -; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[10:11] +; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f64: @@ -945,90 +942,87 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 
vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 
v[14:15], -v[4:5], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 
v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] -; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[10:11] +; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], 
v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] -; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[10:11] +; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f64_ulp25: @@ -1104,35 +1098,35 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; 
GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_mov_b32_e32 v22, 0x3ff00000 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v15, v22 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 -; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[6:7] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], v[14:15] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], 
v[14:15], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v19, v22 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[20:21], v[10:11], v[16:17] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64: @@ -1264,35 +1258,35 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_mov_b32_e32 v22, 0x3ff00000 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v15, v22 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 -; GFX6-NEXT: v_mul_f64 
v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[6:7] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], v[14:15] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v19, v22 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[20:21], v[10:11], v[16:17] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64_arcp: @@ -1491,35 +1485,35 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], 
v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_mov_b32_e32 v22, 0x3ff00000 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v15, v22 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 -; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[6:7] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], v[14:15] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 -; 
GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v19, v22 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[20:21], v[10:11], v[16:17] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64_ulp25: @@ -1725,90 +1719,87 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], 
v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[4:5], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f64_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], 
v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] -; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX8-NEXT: v_fma_f64 v[10:11], 
-v[4:5], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[10:11] +; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: 
v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] -; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[10:11] +; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f64_arcp_ulp25: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fc81e16d68e98..509d01d13cc74 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -7446,273 +7446,273 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[18:19], v[0:1], v18 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1 ; GFX6-NEXT: v_not_b32_e32 v16, v16 -; GFX6-NEXT: v_or_b32_e32 
v21, v17, v21 -; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v16 +; GFX6-NEXT: v_or_b32_e32 v25, v18, v21 +; GFX6-NEXT: v_lshlrev_b32_e32 v21, 31, v10 ; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 -; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24 -; GFX6-NEXT: v_not_b32_e32 v25, 63 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v23, v25 -; GFX6-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX6-NEXT: v_not_b32_e32 v17, 63 +; GFX6-NEXT: v_or_b32_e32 v26, v19, v22 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[10:11], v18 +; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v16 +; GFX6-NEXT: v_add_i32_e32 v24, vcc, v23, v17 +; GFX6-NEXT: v_or_b32_e32 v21, v21, v18 +; GFX6-NEXT: v_or_b32_e32 v22, v22, v19 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[0:1], v24 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v24, v25 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0 -; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, 
v8, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v26, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 64, v17 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v3 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17 -; GFX6-NEXT: v_or_b32_e32 v3, v16, v19 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v17, v25 -; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v17 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 -; GFX6-NEXT: v_not_b32_e32 v8, v20 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_and_b32_e32 v12, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v12 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v12 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_add_i32_e32 v13, vcc, v12, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v16, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v18, v18, v25, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0 +; GFX6-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v16 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, 
v2, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v21, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v22, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v10, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, v11, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 +; GFX6-NEXT: v_add_i32_e32 v21, vcc, v16, v17 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[4:5], v16 ; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v12 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v17, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v21 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v18, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v19, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc +; GFX6-NEXT: v_lshr_b64 v[8:9], v[12:13], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v10, 31, v14 +; GFX6-NEXT: v_not_b32_e32 
v12, v20 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[14:15], 1 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v12 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 64, v18 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v17 +; GFX6-NEXT: v_lshr_b64 v[12:13], v[8:9], v18 +; GFX6-NEXT: v_lshl_b64 v[14:15], v[10:11], v14 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v18 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v19 +; GFX6-NEXT: v_or_b32_e32 v12, v12, v14 +; GFX6-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v17, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v24, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v25, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v10 +; GFX6-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX8-NEXT: v_not_b32_e32 v16, v16 -; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v16 +; GFX8-NEXT: v_or_b32_e32 v25, v18, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 31, v10 ; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: 
v_or_b32_e32 v9, v9, v17 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] -; GFX8-NEXT: v_not_b32_e32 v25, 63 -; GFX8-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v23, v25 -; GFX8-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX8-NEXT: v_not_b32_e32 v17, 63 +; GFX8-NEXT: v_or_b32_e32 v26, v19, v22 +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[21:22], v16, v[8:9] +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v23, v17 +; GFX8-NEXT: v_or_b32_e32 v21, v21, v18 +; GFX8-NEXT: v_or_b32_e32 v22, v22, v19 +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v24, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v24, v25 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v26, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 
v1, v18, v3 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 64, v17 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v3, v16, v19 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v17, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v25, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v16, v[10:11] +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v21, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v22, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v10, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, v11, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v16, v17 +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v16, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc -; GFX8-NEXT: 
v_cndmask_b32_e32 v19, v5, v7, vcc -; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 -; GFX8-NEXT: v_not_b32_e32 v8, v20 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_and_b32_e32 v12, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v12 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v12, v25 -; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX8-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v17, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v21, v[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 31, v14 +; GFX8-NEXT: v_not_b32_e32 v12, v20 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15] +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v12 +; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 64, v18 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v17 +; GFX8-NEXT: 
v_lshrrev_b64 v[12:13], v18, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[14:15], v14, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v18, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v19, v[10:11] +; GFX8-NEXT: v_or_b32_e32 v12, v12, v14 +; GFX8-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v17, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v24, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v25, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v10 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX9-NEXT: v_sub_u32_e32 v21, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3] +; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 +; GFX9-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_or_b32_e32 v21, v21, v23 +; GFX9-NEXT: v_or_b32_e32 v22, v22, v24 ; GFX9-NEXT: v_not_b32_e32 v16, v16 -; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 -; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 -; GFX9-NEXT: 
v_or_b32_e32 v21, v17, v21 -; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23 -; GFX9-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[10:11] +; GFX9-NEXT: v_sub_u32_e32 v26, 64, v24 +; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v24, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v26, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cndmask_b32_e64 v19, v21, v2, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v24 +; GFX9-NEXT: v_or_b32_e32 v10, v16, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v22, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, v[0:1] 
+; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-NEXT: v_or_b32_e32 v10, v17, v11 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc ; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX9-NEXT: v_or_b32_e32 v0, v23, v2 +; GFX9-NEXT: v_or_b32_e32 v2, v19, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 ; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 -; GFX9-NEXT: v_or_b32_e32 v0, v25, v2 -; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc -; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc -; GFX9-NEXT: v_not_b32_e32 v8, v20 -; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX9-NEXT: v_and_b32_e32 v13, 0x7f, v8 -; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 -; GFX9-NEXT: 
v_sub_u32_e32 v10, 64, v13 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_add_u32_e32 v14, 0xffffffc0, v13 -; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v17, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v19, v[4:5] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX9-NEXT: v_not_b32_e32 v12, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15] +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v12 +; GFX9-NEXT: v_lshl_or_b32 v9, v14, 31, v9 +; GFX9-NEXT: v_sub_u32_e32 v14, 64, v18 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v18, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], v14, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v18, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v19, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v12, v12, v14 +; GFX9-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v10, 
v10, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v17, vcc +; GFX9-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v10 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 238cc06fc7f7c..696df45842e22 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -7486,184 +7486,184 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX6-NEXT: v_not_b32_e32 v0, v16 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX6-NEXT: v_or_b32_e32 v23, v0, v21 +; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v25 -; GFX6-NEXT: v_or_b32_e32 v24, v1, v22 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[10:11], v0 -; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25 -; GFX6-NEXT: v_not_b32_e32 v26, 63 -; GFX6-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v19, v26 -; GFX6-NEXT: v_or_b32_e32 v22, v22, v1 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v0 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX6-NEXT: 
v_cndmask_b32_e32 v1, v1, v24, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] -; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v25, v26 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[18:19], v0 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v25 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v16 +; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v16 +; GFX6-NEXT: v_not_b32_e32 v17, 63 +; GFX6-NEXT: v_lshl_b64 v[23:24], v[10:11], v0 +; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v16 +; GFX6-NEXT: v_add_i32_e32 v26, vcc, v25, v17 +; GFX6-NEXT: v_or_b32_e32 v23, v0, v23 +; GFX6-NEXT: v_or_b32_e32 v24, v1, v24 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[18:19], v26 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v25 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[18:19], v25 +; GFX6-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v16, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0 -; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v19, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v16 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v21, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v23, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v3, v22, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v24, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v10, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, v11, s[4:5] ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v16, v8 -; GFX6-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX6-NEXT: v_not_b32_e32 v4, v20 ; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v16 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX6-NEXT: v_add_i32_e32 v21, vcc, v16, v17 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v16, v26 +; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[8:9], v16 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v21 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v17 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v18, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v19, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 
64, v10 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v10 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 -; GFX6-NEXT: v_add_i32_e32 v11, vcc, v10, v26 -; GFX6-NEXT: v_or_b32_e32 v16, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v19, v5, v7 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e32 v16, v8, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, v9, v7, vcc +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[12:13], v19 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[14:15], v8 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GFX6-NEXT: v_or_b32_e32 v8, v6, v8 +; GFX6-NEXT: v_or_b32_e32 v9, v7, v9 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v17 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[14:15], v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v17, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v11, vcc +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v16, v8 +; GFX6-NEXT: v_or_b32_e32 v7, v18, v9 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: 
v_lshlrev_b64 v[17:18], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[18:19], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v16 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX8-NEXT: v_or_b32_e32 v23, v0, v21 +; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v25 -; GFX8-NEXT: v_or_b32_e32 v24, v1, v22 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] -; GFX8-NEXT: v_not_b32_e32 v26, 63 -; GFX8-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v26 -; GFX8-NEXT: v_or_b32_e32 v22, v22, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] -; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v25, v26 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[18:19] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v16 +; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v16 +; GFX8-NEXT: v_not_b32_e32 v17, 63 +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v0, v[10:11] +; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v16, v[8:9] +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v25, v17 +; GFX8-NEXT: v_or_b32_e32 v23, v0, v23 +; GFX8-NEXT: v_or_b32_e32 v24, v1, v24 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[18:19] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v25 +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v25, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e32 
v21, v0, v21, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v19, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v16, v[10:11] +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v21, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v23, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v22, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v24, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v10, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, v11, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v16, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX8-NEXT: v_not_b32_e32 v4, v20 ; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v16, v17 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX8-NEXT: v_add_u32_e32 
v17, vcc, v16, v26 +; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v16, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v21, v[8:9] ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v10 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v16, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v19, v5, v7 -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, v9, v7, vcc +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v19, v[12:13] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[14:15] +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v19, v17 +; GFX8-NEXT: v_or_b32_e32 v8, v6, v8 +; GFX8-NEXT: v_or_b32_e32 v9, v7, v9 +; GFX8-NEXT: 
v_lshrrev_b64 v[6:7], v17, v[14:15] +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v19, v[14:15] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v17, v6 -; GFX8-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v11, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: v_or_b32_e32 v6, v16, v8 +; GFX8-NEXT: v_or_b32_e32 v7, v18, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i128: @@ -7674,87 +7674,87 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v16 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX9-NEXT: v_or_b32_e32 v23, v0, v21 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v25 -; GFX9-NEXT: v_or_b32_e32 v24, v1, v22 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX9-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v19 -; GFX9-NEXT: v_or_b32_e32 v22, v22, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc -; GFX9-NEXT: 
v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25 +; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v0 +; GFX9-NEXT: v_sub_u32_e32 v19, 64, v25 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], v19, v[17:18] +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v25, v[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v25 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v25 +; GFX9-NEXT: v_or_b32_e32 v21, v21, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v0, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v22, v24 +; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX9-NEXT: v_sub_u32_e32 v24, 64, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v16, v21, vcc +; GFX9-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v24, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v25 +; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v2, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v22 +; GFX9-NEXT: v_or_b32_e32 v16, v18, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e32 v24, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 +; GFX9-NEXT: v_or_b32_e32 v10, v19, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; 
GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v0, v16, v8 -; GFX9-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX9-NEXT: v_or_b32_e32 v0, v23, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v24, v3 +; GFX9-NEXT: v_or_b32_e32 v2, v21, v8 +; GFX9-NEXT: v_or_b32_e32 v3, v18, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v20 -; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v18 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v19, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; 
GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 -; GFX9-NEXT: v_or_b32_e32 v16, v4, v6 -; GFX9-NEXT: v_or_b32_e32 v19, v5, v7 -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v9, v7, vcc +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v18, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[14:15] +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 +; GFX9-NEXT: v_or_b32_e32 v8, v6, v8 +; GFX9-NEXT: v_or_b32_e32 v9, v7, v9 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v19, v[14:15] +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v18, v[14:15] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v17, v6 -; GFX9-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX9-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v11, vcc +; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX9-NEXT: v_or_b32_e32 
v5, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v6, v16, v8 +; GFX9-NEXT: v_or_b32_e32 v7, v17, v9 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 3e1602625f197..eaaf95ddac0ba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -154,61 +154,61 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20 ; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36 ; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52 ; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56 ; GCN-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], 0 offset:64 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:80 ; GCN-NEXT: buffer_load_dword 
v17, off, s[0:3], 0 offset:68 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], 0 offset:72 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:76 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:80 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], 0 offset:84 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 offset:88 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:92 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:76 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], 0 offset:96 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:112 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 0 offset:100 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], 0 offset:104 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:108 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:112 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], 0 offset:116 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], 0 offset:120 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], 0 offset:124 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:108 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 0 offset:128 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:144 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 0 offset:132 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], 0 offset:136 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:140 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:144 ; GCN-NEXT: buffer_load_dword v37, off, s[0:3], 0 offset:148 ; GCN-NEXT: buffer_load_dword v38, off, s[0:3], 0 offset:152 ; GCN-NEXT: buffer_load_dword v39, off, s[0:3], 0 offset:156 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:140 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 0 offset:160 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], 0 offset:176 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], 0 offset:164 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], 0 offset:168 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:172 -; GCN-NEXT: 
buffer_load_dword v44, off, s[0:3], 0 offset:176 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], 0 offset:180 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], 0 offset:184 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 0 offset:188 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:172 ; GCN-NEXT: buffer_load_dword v48, off, s[0:3], 0 offset:192 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:208 ; GCN-NEXT: buffer_load_dword v49, off, s[0:3], 0 offset:196 ; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 0 offset:200 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:204 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:208 ; GCN-NEXT: buffer_load_dword v53, off, s[0:3], 0 offset:212 ; GCN-NEXT: buffer_load_dword v54, off, s[0:3], 0 offset:216 ; GCN-NEXT: buffer_load_dword v55, off, s[0:3], 0 offset:220 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:204 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], 0 offset:224 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 0 offset:228 ; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 0 offset:232 @@ -217,33 +217,26 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:244 ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:248 ; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(60) +; GCN-NEXT: s_waitcnt vmcnt(56) ; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21] -; GCN-NEXT: s_waitcnt vmcnt(57) ; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16 -; GCN-NEXT: s_waitcnt vmcnt(54) +; GCN-NEXT: s_waitcnt vmcnt(50) ; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32 -; GCN-NEXT: s_waitcnt vmcnt(51) ; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48 -; GCN-NEXT: s_waitcnt vmcnt(48) +; GCN-NEXT: s_waitcnt vmcnt(44) ; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64 -; GCN-NEXT: s_waitcnt vmcnt(45) ; 
GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80 -; GCN-NEXT: s_waitcnt vmcnt(42) +; GCN-NEXT: s_waitcnt vmcnt(38) ; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96 -; GCN-NEXT: s_waitcnt vmcnt(39) ; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112 -; GCN-NEXT: s_waitcnt vmcnt(36) +; GCN-NEXT: s_waitcnt vmcnt(32) ; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128 -; GCN-NEXT: s_waitcnt vmcnt(33) ; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144 -; GCN-NEXT: s_waitcnt vmcnt(30) +; GCN-NEXT: s_waitcnt vmcnt(26) ; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160 -; GCN-NEXT: s_waitcnt vmcnt(27) ; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176 -; GCN-NEXT: s_waitcnt vmcnt(24) +; GCN-NEXT: s_waitcnt vmcnt(20) ; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192 -; GCN-NEXT: s_waitcnt vmcnt(21) ; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208 ; GCN-NEXT: s_waitcnt vmcnt(18) ; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index cae833b0d64e3..145645de7afb1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -3268,12 +3268,12 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v14, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v14, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v14, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v14, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v14, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v14, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v14, s[6:7] +; GFX9-NEXT: 
v_cndmask_b32_e64 v8, v8, v14, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[10:11] ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[12:13], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v[12:13], v[6:9], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_s: @@ -3410,34 +3410,34 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i ; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX9-NEXT: s_andn2_b32 s1, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshl_or_b32 v8, v0, s2, v1 +; GFX9-NEXT: v_lshl_or_b32 v12, v0, s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 4 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 4 ; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 5 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 6 ; GFX9-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; 
GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 7 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm @@ -3598,28 +3598,28 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 +; GFX9-NEXT: v_and_or_b32 v12, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[12:13] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[14:15] ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; 
GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm @@ -3659,28 +3659,28 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_or_b32_e32 v9, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v12, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NEXT: v_mov_b32_e32 v6, s22 ; GFX8-NEXT: v_mov_b32_e32 v7, s23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[12:13] ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[14:15] ; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 
v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm @@ -3890,7 +3890,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX9-NEXT: v_not_b32_e32 v1, v1 -; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 +; GFX9-NEXT: v_and_or_b32 v12, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 @@ -3900,18 +3900,18 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v6, s18 ; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[12:13] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm @@ -3950,7 +3950,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, 
v1 -; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v12, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 @@ -3960,18 +3960,18 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[12:13] ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm @@ -4188,12 +4188,12 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v15, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v15, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v15, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v15, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v15, 
s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[10:11] ; GFX9-NEXT: global_store_dwordx4 v[11:12], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[13:14], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v[13:14], v[7:10], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_v: @@ -4237,12 +4237,12 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v15, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v15, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v15, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v15, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v15, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v15, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[10:11] ; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[7:10] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i16_s_v: @@ -4432,12 +4432,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v15, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v15, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v15, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v15, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[10:11] ; GFX9-NEXT: 
global_store_dwordx4 v[11:12], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[13:14], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v[13:14], v[7:10], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_v_s: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 51d0b225b2a27..c9429fdefa399 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -701,51 +701,51 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s7 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s15 +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GPRIDX-NEXT: 
v_mov_b32_e32 v16, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v2 -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[16:17] -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[16:17] -; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[14:15] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15] -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off -; GPRIDX-NEXT: s_waitcnt vmcnt(0) +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v2 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc +; 
GPRIDX-NEXT: v_cndmask_b32_e64 v9, v17, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v16, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v18, v1, s[4:5] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; @@ -901,54 +901,54 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v0, s19 -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[12:13] +; GPRIDX-NEXT: v_mov_b32_e32 v6, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v0, 
s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v0, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s8 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s10 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) +; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 7, v0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: 
global_store_dwordx4 v[0:1], v[9:12], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[13:16], off +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v15, v17, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v18, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v16, v18, s[0:1] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; @@ -1320,52 +1320,52 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v2 -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[12:13] +; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 +; 
GPRIDX-NEXT: v_mov_b32_e32 v7, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[10:11] +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) +; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 7, v2 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], 
v[7:10], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v17, v0, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v16, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v18, v1, s[0:1] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; @@ -1509,29 +1509,29 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_v: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: v_mov_b32_e32 v17, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s3 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s3 ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v16 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v17, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v18, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v17, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v18, s[0:1] +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 4, v16 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v17, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v18, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v18, s[0:1] +; GPRIDX-NEXT: v_cmp_eq_u32_e64 
s[0:1], 6, v16 ; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 -; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v18, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v17, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v18, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v15, v15, v18, vcc ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off @@ -1695,24 +1695,24 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v18 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v18 +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[0:1] +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 4, v18 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v18 -; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v16, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v18 +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[0:1] +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 6, v18 ; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v18 -; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GPRIDX-NEXT: 
v_cndmask_b32_e32 v13, v13, v17, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v18 +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off @@ -2420,24 +2420,24 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v18 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v18 +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[0:1] +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 4, v18 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v18 -; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v16, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v18 +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[0:1] +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 6, v18 ; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v18 -; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v18 +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e32 v14, v14, 
v16, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off @@ -4586,7 +4586,6 @@ define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %ve ; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s30, s32 ; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 -; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1 ; GPRIDX-NEXT: v_mov_b32_e32 v32, s30 ; GPRIDX-NEXT: v_mov_b32_e32 v31, s29 ; GPRIDX-NEXT: v_mov_b32_e32 v30, s28 @@ -4618,7 +4617,8 @@ define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %ve ; GPRIDX-NEXT: v_mov_b32_e32 v4, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 -; GPRIDX-NEXT: s_set_gpr_idx_on s33, gpr_idx(DST) +; GPRIDX-NEXT: s_lshl_b32 s0, s34, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v2, v0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off @@ -4886,7 +4886,6 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inr ; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s30, s32 ; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 -; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1 ; GPRIDX-NEXT: v_mov_b32_e32 v32, s30 ; GPRIDX-NEXT: v_mov_b32_e32 v31, s29 ; GPRIDX-NEXT: v_mov_b32_e32 v30, s28 @@ -4918,7 +4917,8 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inr ; GPRIDX-NEXT: v_mov_b32_e32 v4, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 -; GPRIDX-NEXT: s_set_gpr_idx_on s33, gpr_idx(DST) +; GPRIDX-NEXT: s_lshl_b32 s0, s34, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v2, v0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off @@ -5651,6 +5651,14 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 +; 
GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 @@ -5662,43 +5670,35 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v2 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v14, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc +; 
GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 -; GPRIDX-NEXT: v_readfirstlane_b32 s3, v6 -; GPRIDX-NEXT: v_readfirstlane_b32 s4, v5 -; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8 -; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7 -; GPRIDX-NEXT: v_readfirstlane_b32 s7, v10 -; GPRIDX-NEXT: v_readfirstlane_b32 s8, v9 -; GPRIDX-NEXT: v_readfirstlane_b32 s9, v12 -; GPRIDX-NEXT: v_readfirstlane_b32 s10, v11 -; GPRIDX-NEXT: v_readfirstlane_b32 s11, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v8, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GPRIDX-NEXT: v_readfirstlane_b32 s3, v3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v9, v0, vcc +; GPRIDX-NEXT: v_readfirstlane_b32 s4, v4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GPRIDX-NEXT: v_readfirstlane_b32 s5, v5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v11, v0, vcc +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v12, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GPRIDX-NEXT: v_readfirstlane_b32 s7, v4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc +; GPRIDX-NEXT: v_readfirstlane_b32 s8, v5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v14, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v15, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GPRIDX-NEXT: v_readfirstlane_b32 s9, v3 +; GPRIDX-NEXT: v_readfirstlane_b32 s10, v4 +; GPRIDX-NEXT: v_readfirstlane_b32 s11, v5 ; GPRIDX-NEXT: v_readfirstlane_b32 s12, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s13, v1 ; GPRIDX-NEXT: ; return to shader part epilog @@ -6209,17 +6209,17 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 
vcc, 2, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 4, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 ; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v11, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v5 @@ -6227,7 +6227,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg ; GPRIDX-NEXT: v_readfirstlane_b32 s4, v7 ; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8 ; GPRIDX-NEXT: v_readfirstlane_b32 s6, v9 -; GPRIDX-NEXT: v_readfirstlane_b32 s7, v2 +; GPRIDX-NEXT: v_readfirstlane_b32 s7, v10 ; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s9, v1 ; GPRIDX-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index e0016b0a5a64d..025f6da5ad04d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -16,166 +16,166 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 ; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc -; LOOP-NEXT: buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64 -; LOOP-NEXT: s_waitcnt expcnt(5) -; LOOP-NEXT: buffer_load_ubyte v29, v[6:7], s[0:3], 0 addr64 
offset:1 -; LOOP-NEXT: s_waitcnt expcnt(2) -; LOOP-NEXT: buffer_load_ubyte v31, v[6:7], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: buffer_load_ubyte v36, v[6:7], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: buffer_load_ubyte v37, v[6:7], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_load_ubyte v39, v[6:7], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: s_waitcnt expcnt(4) +; LOOP-NEXT: buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 +; LOOP-NEXT: buffer_load_ubyte v35, v[6:7], s[0:3], 0 addr64 offset:1 +; LOOP-NEXT: buffer_load_ubyte v36, v[6:7], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_load_ubyte v37, v[6:7], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:4 +; LOOP-NEXT: buffer_load_ubyte v39, v[6:7], s[0:3], 0 addr64 offset:5 +; LOOP-NEXT: buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:7 ; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8 -; LOOP-NEXT: buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: s_waitcnt expcnt(3) +; LOOP-NEXT: buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:9 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:11 +; LOOP-NEXT: buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:11 ; LOOP-NEXT: buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:15 -; LOOP-NEXT: buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16 -; LOOP-NEXT: 
buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:17 -; LOOP-NEXT: buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18 -; LOOP-NEXT: buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:19 -; LOOP-NEXT: buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20 -; LOOP-NEXT: buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:21 -; LOOP-NEXT: buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22 -; LOOP-NEXT: buffer_load_ubyte v25, v[6:7], s[0:3], 0 addr64 offset:23 -; LOOP-NEXT: buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24 -; LOOP-NEXT: buffer_load_ubyte v27, v[6:7], s[0:3], 0 addr64 offset:25 -; LOOP-NEXT: buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26 -; LOOP-NEXT: buffer_load_ubyte v30, v[6:7], s[0:3], 0 addr64 offset:27 -; LOOP-NEXT: buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28 -; LOOP-NEXT: buffer_load_ubyte v33, v[6:7], s[0:3], 0 addr64 offset:29 -; LOOP-NEXT: buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30 -; LOOP-NEXT: buffer_load_ubyte v35, v[6:7], s[0:3], 0 addr64 offset:31 +; LOOP-NEXT: buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:13 +; LOOP-NEXT: buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:14 +; LOOP-NEXT: buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:17 +; LOOP-NEXT: buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:18 +; LOOP-NEXT: buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: buffer_load_ubyte v25, v[6:7], s[0:3], 0 addr64 offset:21 +; LOOP-NEXT: buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64 offset:22 +; LOOP-NEXT: buffer_load_ubyte v27, v[6:7], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: buffer_load_ubyte v29, v[6:7], s[0:3], 0 addr64 
offset:25 +; LOOP-NEXT: buffer_load_ubyte v30, v[6:7], s[0:3], 0 addr64 offset:26 +; LOOP-NEXT: buffer_load_ubyte v31, v[6:7], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:29 +; LOOP-NEXT: buffer_load_ubyte v33, v[6:7], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_waitcnt vmcnt(14) -; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v29 -; LOOP-NEXT: v_or_b32_e32 v26, v6, v26 -; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v32 -; LOOP-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; LOOP-NEXT: v_or_b32_e32 v29, v6, v7 -; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v37 -; LOOP-NEXT: v_lshlrev_b32_e32 v7, 24, v39 -; LOOP-NEXT: v_lshlrev_b32_e32 v32, 16, v38 -; LOOP-NEXT: v_or_b32_e32 v31, v6, v36 -; LOOP-NEXT: v_or_b32_e32 v32, v7, v32 +; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v35 +; LOOP-NEXT: v_or_b32_e32 v28, v6, v28 +; LOOP-NEXT: v_lshlrev_b32_e32 v7, 24, v37 +; LOOP-NEXT: v_lshlrev_b32_e32 v35, 16, v36 ; LOOP-NEXT: v_add_i32_e32 v6, vcc, v0, v4 +; LOOP-NEXT: v_or_b32_e32 v35, v7, v35 +; LOOP-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; LOOP-NEXT: v_or_b32_e32 v36, v7, v38 ; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v1, v5, vcc ; LOOP-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v4 -; LOOP-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; LOOP-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; LOOP-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; LOOP-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; LOOP-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; LOOP-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; LOOP-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; LOOP-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; LOOP-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; LOOP-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; LOOP-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; LOOP-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; LOOP-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; 
LOOP-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; LOOP-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; LOOP-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; LOOP-NEXT: s_waitcnt vmcnt(12) -; LOOP-NEXT: v_lshlrev_b32_e32 v21, 24, v21 -; LOOP-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; LOOP-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; LOOP-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; LOOP-NEXT: s_waitcnt vmcnt(10) -; LOOP-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; LOOP-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; LOOP-NEXT: s_waitcnt vmcnt(8) -; LOOP-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; LOOP-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; LOOP-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; LOOP-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; LOOP-NEXT: s_waitcnt vmcnt(6) -; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; LOOP-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; LOOP-NEXT: s_waitcnt vmcnt(4) -; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v30 -; LOOP-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; LOOP-NEXT: v_lshlrev_b32_e32 v31, 24, v31 +; LOOP-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; LOOP-NEXT: s_waitcnt vmcnt(2) -; LOOP-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; LOOP-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v35, 24, v35 -; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; LOOP-NEXT: v_or_b32_e32 v8, v11, v8 -; LOOP-NEXT: v_or_b32_e32 v11, v13, v12 -; LOOP-NEXT: v_or_b32_e32 v9, v15, v9 -; LOOP-NEXT: v_or_b32_e32 v12, v17, v16 -; LOOP-NEXT: v_or_b32_e32 v10, v19, v10 -; LOOP-NEXT: v_or_b32_e32 v13, v21, v20 -; LOOP-NEXT: v_or_b32_e32 v14, v23, v14 -; LOOP-NEXT: v_or_b32_e32 v15, v25, v24 -; LOOP-NEXT: v_or_b32_e32 v16, v27, v18 -; LOOP-NEXT: v_or_b32_e32 v17, v30, v28 -; LOOP-NEXT: v_or_b32_e32 v18, v33, v22 -; LOOP-NEXT: v_or_b32_e32 v19, v35, v34 -; LOOP-NEXT: v_or_b32_e32 v20, v29, v26 -; LOOP-NEXT: v_or_b32_e32 v21, v32, v31 -; LOOP-NEXT: v_or_b32_e32 v8, v11, v8 -; LOOP-NEXT: v_or_b32_e32 v9, v12, v9 -; LOOP-NEXT: v_or_b32_e32 v10, v13, v10 -; LOOP-NEXT: v_or_b32_e32 v11, v15, v14 -; LOOP-NEXT: 
v_or_b32_e32 v12, v17, v16 +; LOOP-NEXT: v_lshlrev_b32_e32 v34, 24, v34 +; LOOP-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; LOOP-NEXT: v_or_b32_e32 v10, v12, v10 +; LOOP-NEXT: v_or_b32_e32 v8, v13, v8 +; LOOP-NEXT: v_or_b32_e32 v12, v15, v14 +; LOOP-NEXT: v_or_b32_e32 v9, v17, v9 ; LOOP-NEXT: v_or_b32_e32 v13, v19, v18 -; LOOP-NEXT: v_lshrrev_b32_e32 v14, 16, v20 -; LOOP-NEXT: v_bfe_u32 v15, v20, 8, 8 -; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 -; LOOP-NEXT: v_lshrrev_b32_e32 v16, 24, v20 -; LOOP-NEXT: v_lshrrev_b32_e32 v17, 16, v21 -; LOOP-NEXT: v_bfe_u32 v18, v21, 8, 8 -; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: v_lshrrev_b32_e32 v19, 24, v21 -; LOOP-NEXT: s_waitcnt expcnt(1) -; LOOP-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; LOOP-NEXT: v_or_b32_e32 v11, v21, v11 +; LOOP-NEXT: v_or_b32_e32 v14, v23, v22 +; LOOP-NEXT: v_or_b32_e32 v15, v25, v16 +; LOOP-NEXT: v_or_b32_e32 v16, v27, v26 +; LOOP-NEXT: v_or_b32_e32 v17, v29, v20 +; LOOP-NEXT: v_or_b32_e32 v18, v31, v30 +; LOOP-NEXT: v_or_b32_e32 v19, v32, v24 +; LOOP-NEXT: v_or_b32_e32 v20, v34, v33 +; LOOP-NEXT: v_or_b32_e32 v21, v35, v28 +; LOOP-NEXT: v_or_b32_e32 v10, v10, v36 +; LOOP-NEXT: v_or_b32_e32 v8, v12, v8 +; LOOP-NEXT: v_or_b32_e32 v9, v13, v9 +; LOOP-NEXT: v_or_b32_e32 v11, v14, v11 +; LOOP-NEXT: v_or_b32_e32 v12, v16, v15 +; LOOP-NEXT: v_or_b32_e32 v13, v18, v17 +; LOOP-NEXT: v_or_b32_e32 v14, v20, v19 +; LOOP-NEXT: v_lshrrev_b32_e32 v15, 16, v21 +; LOOP-NEXT: v_bfe_u32 v16, v21, 8, 8 +; LOOP-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; LOOP-NEXT: v_bfe_u32 v18, v10, 8, 8 +; LOOP-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; LOOP-NEXT: v_bfe_u32 v20, v8, 8, 8 +; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; LOOP-NEXT: v_bfe_u32 v23, v9, 8, 8 +; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; LOOP-NEXT: v_bfe_u32 v25, v11, 8, 8 +; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; LOOP-NEXT: v_bfe_u32 v27, v12, 8, 8 +; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 
+; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v21, 24, v21 +; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; LOOP-NEXT: v_bfe_u32 v29, v13, 8, 8 +; LOOP-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:4 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_bfe_u32 v21, v8, 8, 8 +; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10 +; LOOP-NEXT: v_bfe_u32 v31, v14, 8, 8 ; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:8 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v8 -; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; LOOP-NEXT: v_bfe_u32 v23, v9, 8, 8 ; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:12 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; LOOP-NEXT: v_bfe_u32 v25, v10, 8, 8 -; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:16 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; LOOP-NEXT: v_bfe_u32 v27, v11, 8, 8 -; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:16 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v11, 24, v11 -; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; LOOP-NEXT: v_bfe_u32 v29, v12, 8, 8 -; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:20 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v12, 24, v12 -; LOOP-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; LOOP-NEXT: v_bfe_u32 v31, v13, 8, 8 -; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:24 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v13, 24, v13 -; LOOP-NEXT: buffer_store_byte v15, v[6:7], 
s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_store_byte v16, v[6:7], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v14, 24, v14 +; LOOP-NEXT: buffer_store_byte v16, v[6:7], s[0:3], 0 addr64 offset:1 +; LOOP-NEXT: buffer_store_byte v15, v[6:7], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:3 ; LOOP-NEXT: buffer_store_byte v18, v[6:7], s[0:3], 0 addr64 offset:5 ; LOOP-NEXT: buffer_store_byte v17, v[6:7], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[0:3], 0 addr64 offset:10 ; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:11 ; LOOP-NEXT: buffer_store_byte v23, v[6:7], s[0:3], 0 addr64 offset:13 ; LOOP-NEXT: buffer_store_byte v22, v[6:7], s[0:3], 0 addr64 offset:14 ; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:15 ; LOOP-NEXT: buffer_store_byte v25, v[6:7], s[0:3], 0 addr64 offset:17 ; LOOP-NEXT: buffer_store_byte v24, v[6:7], s[0:3], 0 addr64 offset:18 -; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:19 ; LOOP-NEXT: buffer_store_byte v27, v[6:7], s[0:3], 0 addr64 offset:21 ; LOOP-NEXT: buffer_store_byte v26, v[6:7], s[0:3], 0 addr64 offset:22 -; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:23 ; LOOP-NEXT: buffer_store_byte v29, v[6:7], 
s[0:3], 0 addr64 offset:25 ; LOOP-NEXT: buffer_store_byte v28, v[6:7], s[0:3], 0 addr64 offset:26 -; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:27 ; LOOP-NEXT: buffer_store_byte v31, v[6:7], s[0:3], 0 addr64 offset:29 ; LOOP-NEXT: buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30 -; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31 +; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_cbranch_vccnz .LBB0_1 ; LOOP-NEXT: ; %bb.2: ; %memcpy-split ; LOOP-NEXT: s_mov_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7529364..29481f97a14fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2400,68 +2400,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; 
GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX7-NEXT: v_mov_b32_e32 v22, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v11, v[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v3, v9, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v14, 0 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v4, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v13, v[20:21] +; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v22, vcc +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[20:21] +; GFX7-NEXT: v_mov_b32_e32 v22, v16 +; GFX7-NEXT: v_mov_b32_e32 v16, v17 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[20:21] +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[20:21] +; GFX7-NEXT: 
v_mad_u64_u32 v[18:19], s[4:5], v1, v9, v[18:19] +; GFX7-NEXT: v_mov_b32_e32 v17, v23 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v8, v[18:19] +; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[16:17] +; GFX7-NEXT: v_mov_b32_e32 v21, v19 +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[10:11], v0, v11, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v21, v3, v12 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v1, v10, v[19:20] +; GFX7-NEXT: v_mul_lo_u32 v20, v2, v13 ; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] ; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v21, v20 -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], 0, v6, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[16:17] +; GFX7-NEXT: v_mov_b32_e32 v19, v18 +; GFX7-NEXT: v_mov_b32_e32 v18, v11 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v0, v9, v[18:19] +; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v6, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[16:17] ; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, 
s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[18:19] +; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v6, v3, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v26, v4, s[10:11] ; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v25, v6, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v24, v0, s[10:11] ; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v20, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v21, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v23, vcc ; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2469,68 +2470,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], 
s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX8-NEXT: v_mov_b32_e32 v22, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v11, v[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, 
v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v3, v9, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v14, 0 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v4, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v13, v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v22, vcc +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[20:21] +; GFX8-NEXT: v_mov_b32_e32 v22, v16 +; GFX8-NEXT: v_mov_b32_e32 v16, v17 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[20:21] +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v9, v[18:19] +; GFX8-NEXT: v_mov_b32_e32 v17, v23 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v8, v[18:19] +; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[16:17] +; GFX8-NEXT: v_mov_b32_e32 v21, v19 +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[10:11], v0, v11, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v21, v3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v1, v10, v[19:20] +; GFX8-NEXT: v_mul_lo_u32 v20, v2, v13 ; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] ; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 
v[18:19], s[10:11], v2, v9, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v21, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], 0, v6, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[16:17] +; GFX8-NEXT: v_mov_b32_e32 v19, v18 +; GFX8-NEXT: v_mov_b32_e32 v18, v11 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v0, v9, v[18:19] +; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v6, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[16:17] ; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[18:19] +; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v6, v3, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v26, v4, s[10:11] ; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v25, v6, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v24, v0, s[10:11] ; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, 
v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v20, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v21, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v23, vcc ; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v10 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2538,68 +2540,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc -; GFX9-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: 
v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v11, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v3, v9, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v14, 0 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v4, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v13, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v22, vcc +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[20:21] +; GFX9-NEXT: v_mov_b32_e32 v22, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[20:21] +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v9, v[18:19] +; GFX9-NEXT: v_mov_b32_e32 v17, v23 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v8, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: 
v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[16:17] +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[10:11], v0, v11, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v21, v3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v1, v10, v[19:20] +; GFX9-NEXT: v_mul_lo_u32 v20, v2, v13 ; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], 0, v6, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[16:17] +; GFX9-NEXT: v_mov_b32_e32 v19, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v11 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v0, v9, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v6, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[16:17] ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[18:19] +; GFX9-NEXT: 
v_addc_co_u32_e64 v3, s[10:11], v6, v3, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v26, v4, s[10:11] ; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v25, v6, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v24, v0, s[10:11] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v20, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v21, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v23, vcc ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 832f066adaa84..b46177e876c98 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -1935,25 +1935,25 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_min_i32_e32 v32, 0, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v31, 1 ; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v31, v32 -; GFX6-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX6-NEXT: v_max_i32_e32 v33, 0, v0 +; GFX6-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX6-NEXT: 
v_bfrev_b32_e32 v16, -2 ; GFX6-NEXT: v_sub_i32_e32 v33, vcc, v16, v33 ; GFX6-NEXT: v_min_i32_e32 v32, v32, v33 +; GFX6-NEXT: v_min_i32_e32 v33, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v32 -; GFX6-NEXT: v_min_i32_e32 v32, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v31, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v31, v33 +; GFX6-NEXT: v_max_i32_e32 v33, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v17, v32, v17 -; GFX6-NEXT: v_max_i32_e32 v32, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v16, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v16, v33 +; GFX6-NEXT: v_min_i32_e32 v33, 0, v2 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v32 +; GFX6-NEXT: v_max_i32_e32 v32, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v33 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v16, v32 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 -; GFX6-NEXT: v_max_i32_e32 v18, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v32 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v17 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 @@ -2018,32 +2018,32 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v12 ; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v12 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v18 ; GFX6-NEXT: v_max_i32_e32 v18, 0, v12 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v13 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v13 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 
v31, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v18 ; GFX6-NEXT: v_max_i32_e32 v18, 0, v13 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v20, 0, v14 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v14 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 ; GFX6-NEXT: v_max_i32_e32 v18, 0, v14 +; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v20 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v21, 0, v15 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_max_i32_e32 v20, 0, v15 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v15 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v15 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v21 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v20 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 @@ -2056,25 +2056,25 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_min_i32_e32 v32, 0, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v31, 1 ; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v31, v32 -; GFX8-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX8-NEXT: v_max_i32_e32 v33, 0, v0 +; GFX8-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX8-NEXT: v_bfrev_b32_e32 v16, -2 ; GFX8-NEXT: v_sub_u32_e32 v33, vcc, v16, v33 ; GFX8-NEXT: v_min_i32_e32 v32, v32, v33 +; GFX8-NEXT: v_min_i32_e32 v33, 0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v32 -; GFX8-NEXT: v_min_i32_e32 v32, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v31, v32 +; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v31, v33 +; GFX8-NEXT: v_max_i32_e32 v33, 0, v1 ; GFX8-NEXT: v_max_i32_e32 v17, v32, v17 -; GFX8-NEXT: v_max_i32_e32 v32, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v32, 
vcc, v16, v32 +; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v16, v33 +; GFX8-NEXT: v_min_i32_e32 v33, 0, v2 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v32 +; GFX8-NEXT: v_max_i32_e32 v32, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v33 +; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v16, v32 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 -; GFX8-NEXT: v_max_i32_e32 v18, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v32 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 @@ -2139,32 +2139,32 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 ; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v12 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v12 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v18 ; GFX8-NEXT: v_max_i32_e32 v18, 0, v12 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 ; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v13 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v13 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v18 ; GFX8-NEXT: v_max_i32_e32 v18, 0, v13 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 ; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v20, 0, v14 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v14 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 ; GFX8-NEXT: v_max_i32_e32 v18, 0, v14 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, 
v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v20 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 ; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v21, 0, v15 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i32_e32 v20, 0, v15 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v15 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v15 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v21 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v20 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index f57fc005b994b..634b1f49a5eff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -450,143 +450,143 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, 
v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 -; GISEL-NEXT: 
v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v5 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v19, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v13 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v19 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v12, v12 +; GISEL-NEXT: v_mul_f32_e32 v19, 0x5f7ffffc, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v13, v12 +; GISEL-NEXT: v_mul_f32_e32 v20, 0x2f800000, v19 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v21, v[1:2] +; GISEL-NEXT: v_trunc_f32_e32 v1, v20 +; GISEL-NEXT: v_mac_f32_e32 v19, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v19 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], 
s[4:5], v17, v19, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v16, v12 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v12, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v10 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v14 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v17, v15, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v22, v15, v13 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v12, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v18, v19, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v20, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v15, v13 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v22, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v20, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v16, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], 1, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, 0, v21, s[4:5] +; 
GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v11, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v11, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v19, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v19, v11 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 
v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v21, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc ; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v2, v13, v10 ; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 
v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 ; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -601,34 +601,34 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[3:4] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; 
GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc @@ -1221,87 +1221,87 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v16, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v19, v16, v14 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; 
GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v19 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v4 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v17, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v20, v1 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 
1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v20, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v13, v20, v13 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v13, v14 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v20, v13 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v20, v13, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; 
GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v17 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v18, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v13, -1, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v21, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1321,19 +1321,20 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v13, vcc 
+; GISEL-NEXT: v_xor_b32_e32 v11, v5, v9 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v9 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 ; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 ; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1357,9 +1358,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc @@ -1405,10 +1405,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, 
v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1424,156 +1424,156 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_trunc_f32_e32 v8, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mov_b32_e32 v8, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] +; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v8, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] ; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v16, v13 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v14 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, 
vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v20, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v1, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v8 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v19 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v4 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v17, v0, vcc +; CGP-NEXT: v_mul_lo_u32 v0, v20, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v20, v1 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 
+; CGP-NEXT: v_mul_lo_u32 v16, v20, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v15, v13 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v1, v0 +; CGP-NEXT: v_mul_hi_u32 v13, v20, v13 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v13, v14 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v20, v13 +; CGP-NEXT: v_subb_u32_e64 v14, s[4:5], v20, v13, vcc +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v17 ; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_add_i32_e32 v20, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v21, vcc, 0, v18, vcc +; CGP-NEXT: v_mov_b32_e32 v0, v5 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; 
CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; CGP-NEXT: v_cndmask_b32_e64 v5, -1, v13, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v1, v7, v0 +; CGP-NEXT: v_cndmask_b32_e32 v13, -1, v19, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cndmask_b32_e32 v13, v15, v20, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v21, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, 
v10 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v5, v17, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v8 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v8 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 ; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 ; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -1581,18 +1581,18 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v8 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1862,23 +1862,23 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v7, v12 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] ; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v11 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v17, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v18, v11 +; GISEL-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v19 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 @@ -1916,122 +1916,122 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; 
GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v6 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v6, vcc ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v6 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v19, v10 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v13 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v19 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v12, v12 +; GISEL-NEXT: v_mul_f32_e32 v19, 0x5f7ffffc, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v9 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v13, v12 +; GISEL-NEXT: v_mul_f32_e32 v20, 0x2f800000, v19 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v21, v[1:2] +; GISEL-NEXT: v_trunc_f32_e32 v1, v20 +; GISEL-NEXT: v_mac_f32_e32 v19, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v19 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v19, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v16, v12 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v12, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v14 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v17, v15, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v22, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v8, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v18, v19, 
v[11:12] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v16, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v11 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v22, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v20, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v20, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v14, v16, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], 1, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_addc_u32_e64 v8, vcc, 0, v21, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v14 +; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: 
v_cndmask_b32_e32 v1, v14, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_lo_u32 v16, v15, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v20, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v19, v12 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v15, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v21, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: 
v_xor_b32_e32 v16, v2, v12 ; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2039,16 +2039,16 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 ; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 @@ -2057,7 +2057,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -2065,35 +2065,35 @@ define <2 x i64> 
@v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v13, v[0:1] ; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v11, v[3:4] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v9 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591a5fcce..0c278ce355197 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1290,22 +1290,23 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s17, 31 -; GFX8-NEXT: s_ashr_i32 s6, s1, 31 +; GFX8-NEXT: s_ashr_i32 s8, s1, 31 ; GFX8-NEXT: s_add_u32 s10, s16, s4 ; GFX8-NEXT: s_addc_u32 s11, s17, s4 -; GFX8-NEXT: s_add_u32 s0, s0, s6 -; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: s_addc_u32 s1, s1, s6 -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX8-NEXT: s_add_u32 s0, s0, s8 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_addc_u32 s1, s1, s8 +; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX8-NEXT: s_mov_b32 s5, s4 ; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[4:5] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s16, 0, s8 -; GFX8-NEXT: s_subb_u32 s17, 0, s9 +; GFX8-NEXT: s_sub_u32 s16, 0, s6 +; GFX8-NEXT: s_subb_u32 s17, 0, s7 +; GFX8-NEXT: s_xor_b64 s[20:21], s[4:5], s[8:9] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 @@ -1313,6 +1314,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr 
addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX8-NEXT: s_ashr_i32 s8, s19, 31 +; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 @@ -1346,9 +1349,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] -; GFX8-NEXT: s_ashr_i32 s6, s19, 31 -; GFX8-NEXT: s_mov_b32 s7, s6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 @@ -1386,205 +1386,205 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v6, s11 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_ashr_i32 s10, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 
s[0:1], s8, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s18, s6 -; GFX8-NEXT: s_addc_u32 s1, s19, s6 +; GFX8-NEXT: s_add_u32 s0, s18, s8 +; GFX8-NEXT: s_addc_u32 s1, s19, s8 ; GFX8-NEXT: s_add_u32 s2, s2, s10 ; GFX8-NEXT: s_mov_b32 s11, s10 ; GFX8-NEXT: s_addc_u32 s3, s3, s10 -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 +; GFX8-NEXT: s_xor_b64 s[16:17], s[2:3], s[10:11] +; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s17 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s16 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s6, v8 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, 
vcc -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v11, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v11 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: s_sub_u32 s5, 0, s16 +; GFX8-NEXT: s_subb_u32 s18, 0, s17 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v15, vcc ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s5, v5, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v8, v5, v0 +; GFX8-NEXT: v_xor_b32_e32 v4, s20, v4 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s18, v12, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX8-NEXT: v_mul_lo_u32 v9, v12, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 +; 
GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_mul_hi_u32 v9, v12, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v8, 0 +; GFX8-NEXT: v_xor_b32_e32 v9, s21, v3 +; GFX8-NEXT: v_mov_b32_e32 v11, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v5, v[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s20, v4 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v8, v[2:3] +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v7 +; 
GFX8-NEXT: v_mul_lo_u32 v4, v5, v1 +; GFX8-NEXT: v_mul_lo_u32 v7, v8, v2 +; GFX8-NEXT: v_mul_hi_u32 v12, v8, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v10, s21 +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v12, v5, v2 +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v8, v2 +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v12, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v12, v7 ; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc 
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: 
v_add_u32_e32 v6, vcc, 1, v14 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 -; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 -; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v4 +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v4 +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v8, v1 +; GFX8-NEXT: v_addc_u32_e64 v2, s[0:1], v5, v2, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v5, s7, v1 +; GFX8-NEXT: v_mul_lo_u32 v7, s6, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s6, v1 +; GFX8-NEXT: v_add_u32_e64 v5, s[2:3], v5, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] +; GFX8-NEXT: v_add_u32_e64 v3, s[2:3], v5, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] +; GFX8-NEXT: v_mul_lo_u32 v5, s7, v2 +; GFX8-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX8-NEXT: v_add_u32_e64 v3, s[2:3], v7, v3 +; GFX8-NEXT: v_mul_hi_u32 v7, s6, v2 +; GFX8-NEXT: v_add_u32_e64 v1, s[2:3], v5, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX8-NEXT: v_add_u32_e64 v1, s[2:3], v1, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] +; GFX8-NEXT: v_add_u32_e64 v5, s[2:3], v5, v7 +; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], v1, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s7, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] +; 
GFX8-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s16, v7, 0 +; GFX8-NEXT: v_add_u32_e64 v3, s[2:3], v5, v3 +; GFX8-NEXT: v_add_u32_e64 v8, s[2:3], v8, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s16, v8, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NEXT: v_mov_b32_e32 v12, s7 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s17, v7, v[2:3] +; GFX8-NEXT: v_sub_u32_e64 v3, s[2:3], s6, v1 +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], s7, v2 +; GFX8-NEXT: v_subb_u32_e64 v12, s[4:5], v12, v2, s[2:3] +; GFX8-NEXT: v_subb_u32_e64 v13, s[2:3], v1, v5, s[2:3] +; GFX8-NEXT: v_cmp_le_u32_e64 s[4:5], s17, v12 +; GFX8-NEXT: v_subrev_u32_e64 v14, s[2:3], s16, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; GFX8-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v13, s[2:3] +; GFX8-NEXT: v_cmp_le_u32_e64 s[4:5], s17, v15 +; GFX8-NEXT: v_cmp_le_u32_e64 s[6:7], s16, v3 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s17, v12 +; GFX8-NEXT: v_cmp_le_u32_e64 s[4:5], s16, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s17, v15 +; GFX8-NEXT: v_add_u32_e64 v10, s[4:5], 1, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX8-NEXT: v_addc_u32_e64 v16, vcc, 0, v8, s[4:5] +; GFX8-NEXT: v_subb_u32_e64 v5, vcc, v13, v5, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 1, v10 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc +; GFX8-NEXT: v_subrev_u32_e64 v10, s[2:3], s16, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc +; GFX8-NEXT: v_subbrev_u32_e64 v16, s[2:3], 0, v5, s[2:3] +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v6, v11, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v13, 
s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, v15, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[0:1] +; GFX8-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s1, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, s8, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, s8, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v6 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 @@ -1601,22 +1601,23 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s17, 31 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_ashr_i32 s8, s1, 31 ; GFX9-NEXT: s_add_u32 s10, s16, s4 ; GFX9-NEXT: s_addc_u32 s11, s17, s4 -; GFX9-NEXT: s_add_u32 s0, s0, s6 -; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s6 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s1, s8 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[4:5] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s16, 0, s8 -; GFX9-NEXT: s_subb_u32 s17, 0, 
s9 +; GFX9-NEXT: s_sub_u32 s16, 0, s6 +; GFX9-NEXT: s_subb_u32 s17, 0, s7 +; GFX9-NEXT: s_xor_b64 s[20:21], s[4:5], s[8:9] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1624,6 +1625,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX9-NEXT: s_ashr_i32 s8, s19, 31 +; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 @@ -1656,9 +1659,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] -; GFX9-NEXT: s_ashr_i32 s6, s19, 31 -; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 @@ -1694,50 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] +; GFX9-NEXT: 
v_mov_b32_e32 v4, s7 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] ; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc ; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s18, s6 -; GFX9-NEXT: s_addc_u32 s1, s19, s6 +; GFX9-NEXT: s_add_u32 s0, s18, s8 +; GFX9-NEXT: s_addc_u32 s1, s19, s8 ; GFX9-NEXT: s_add_u32 s2, s2, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 +; GFX9-NEXT: s_xor_b64 s[16:17], s[2:3], s[10:11] +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s16 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc ; 
GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1745,155 +1745,155 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] -; GFX9-NEXT: s_sub_u32 s5, 0, s2 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GFX9-NEXT: s_sub_u32 s5, 0, s16 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 -; GFX9-NEXT: s_subb_u32 s20, 0, s3 +; GFX9-NEXT: s_subb_u32 s18, 0, s17 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s18, v17, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, v17, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 ; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 ; GFX9-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v17, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 +; GFX9-NEXT: v_add_u32_e32 v4, v10, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] +; GFX9-NEXT: v_add3_u32 v1, v4, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v17, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s5, v4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: 
v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 -; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v5, s20, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s20, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v4, v[2:3] +; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 +; GFX9-NEXT: v_mul_lo_u32 v5, v10, v1 +; GFX9-NEXT: v_mul_lo_u32 v7, v4, v2 +; GFX9-NEXT: v_mul_hi_u32 v11, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v10, v1 ; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 -; 
GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 -; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: 
v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v11, v10, v2 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v4, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v10, v2 +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v11, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v5 +; GFX9-NEXT: v_add_u32_e32 v7, v11, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_add3_u32 v2, v7, v5, v2 +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v4, v1 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], v10, v2, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 +; GFX9-NEXT: v_mul_lo_u32 v5, s6, v2 +; GFX9-NEXT: v_mul_hi_u32 v10, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX9-NEXT: v_mul_hi_u32 v13, s7, v2 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v4, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v4, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v10, s7, v2 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v10, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v11, s[0:1], v1, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v11, 0 +; GFX9-NEXT: 
v_subrev_co_u32_e64 v4, s[0:1], s4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v10, v5 +; GFX9-NEXT: v_add3_u32 v10, v3, v12, v13 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s16, v10, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s7 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s17, v11, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e64 v3, s[2:3], s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v2 +; GFX9-NEXT: v_subb_co_u32_e64 v12, s[4:5], v12, v2, s[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v13, s[2:3], v1, v5, s[2:3] +; GFX9-NEXT: v_cmp_le_u32_e64 s[4:5], s17, v12 +; GFX9-NEXT: v_subrev_co_u32_e64 v14, s[2:3], s16, v3 +; GFX9-NEXT: v_xor_b32_e32 v8, s21, v8 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; GFX9-NEXT: v_subbrev_co_u32_e64 v15, s[4:5], 0, v13, s[2:3] +; GFX9-NEXT: v_cmp_le_u32_e64 s[4:5], s17, v15 +; GFX9-NEXT: v_cmp_le_u32_e64 s[6:7], s16, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s17, v12 +; GFX9-NEXT: v_cmp_le_u32_e64 s[4:5], s16, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s17, v15 +; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 1, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v16, vcc, 0, v10, s[4:5] +; GFX9-NEXT: v_subb_co_u32_e64 v5, vcc, v13, v5, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, 1, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v16, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v16, s[2:3], s16, v14 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[2:3], 0, v5, s[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v6, v7, s[0:1] +; GFX9-NEXT: 
v_cndmask_b32_e64 v6, v11, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc +; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v14, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v2, s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 -; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v8, s[4:5] +; GFX9-NEXT: v_xor_b32_e32 v3, s1, v7 ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, s8, v8 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_xor_b32_e32 v7, s8, v10 +; GFX9-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s8, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc -; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] -; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] +; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 8d8eca162257a..24c8dfd0154d3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -481,79 +481,79 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_trunc_f32_e32 v12, v10 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 +; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v14, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v18, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v12 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], v14, v9 ; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, v[0:1] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] +; GISEL-NEXT: v_mul_lo_u32 v1, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v14 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v8 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v19, v12, s[6:7] +; GISEL-NEXT: 
v_subbrev_u32_e64 v14, s[6:7], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v19, v19, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v19, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 
v20, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v19, v10 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v11, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 ; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -561,18 +561,18 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: 
v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 ; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 ; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 +; GISEL-NEXT: v_xor_b32_e32 v10, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 @@ -1147,85 +1147,85 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v16, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v19, v16, v14 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 
v13, vcc, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v19 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v4 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v17, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v16, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v20, v1 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; 
GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v16, v20, v0 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v20, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v1, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v13 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v20, v13, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, 
v0, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v18, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1346,154 +1346,154 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_trunc_f32_e32 v8, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mov_b32_e32 v8, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] +; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: 
v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v8, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] ; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v16, v13 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: 
v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v14 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v20, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v1, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v8 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v19 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v4 +; CGP-NEXT: v_addc_u32_e32 v0, vcc, v17, v0, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v20, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v1 
+; CGP-NEXT: v_mul_hi_u32 v1, v20, v1 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v20, v0 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v15, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v20, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v1, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v13 +; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v20, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 ; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 ; 
CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mul_lo_u32 v19, v7, v0 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; CGP-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; 
CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v8 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v8 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 ; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 ; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -1501,18 
+1501,18 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v8 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1740,85 +1740,85 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v16, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v19, v16, v14 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 
1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v19 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v4 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v17, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v16, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v20, v1 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v16, v20, v0 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v20, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v1, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v13 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v20, v13, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v17 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v18, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1939,154 +1939,154 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_trunc_f32_e32 v8, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mov_b32_e32 v8, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] +; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; 
CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v8, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] ; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v16, v13 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v16, 
v14 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v14 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v20, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v1, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v8 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v19 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v4 +; CGP-NEXT: 
v_addc_u32_e32 v0, vcc, v17, v0, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v20, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v20, v1 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v20, v0 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v15, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v20, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v1, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v13 +; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v20, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 ; CGP-NEXT: v_sub_i32_e32 
v16, vcc, v14, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 ; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mul_lo_u32 v19, v7, v0 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; CGP-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 
v10, vcc, v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v8 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v8 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 ; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 ; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: 
v_mul_lo_u32 v3, v9, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -2094,18 +2094,18 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v8 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -2465,123 +2465,123 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_trunc_f32_e32 v13, v10 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 ; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] ; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: 
v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v15, v14, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 ; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v10 ; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, v19, v12, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v7 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v19, v19, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e32 
v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v19, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v20, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v19, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v13, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v7, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v11, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc +; 
GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v7, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v15, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 +; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 ; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; 
GISEL-NEXT: v_xor_b32_e32 v11, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v2, 
vcc, v13, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 2673ac4fb5bae..f538ec1919608 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -1938,25 +1938,25 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v32, -1, v0 ; GFX6-NEXT: v_mov_b32_e32 v31, 0x80000001 ; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v31 -; GFX6-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX6-NEXT: v_min_i32_e32 v33, -1, v0 +; GFX6-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 ; GFX6-NEXT: v_add_i32_e32 v33, vcc, v33, v16 ; GFX6-NEXT: v_min_i32_e32 v32, v32, v33 +; GFX6-NEXT: v_max_i32_e32 v33, -1, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v32 -; GFX6-NEXT: v_max_i32_e32 v32, -1, v1 -; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v31 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v33, v31 +; GFX6-NEXT: v_min_i32_e32 v33, -1, v1 ; GFX6-NEXT: v_max_i32_e32 v17, v32, v17 -; GFX6-NEXT: v_min_i32_e32 v32, -1, v1 -; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v16 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v33, v16 +; GFX6-NEXT: v_max_i32_e32 v33, -1, v2 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v32 +; GFX6-NEXT: v_min_i32_e32 v32, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v2 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v33, v31 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 -; GFX6-NEXT: v_min_i32_e32 v18, -1, v2 -; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v32 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, 
v17, v31 @@ -2021,32 +2021,32 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_max_i32_e32 v18, -1, v12 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v18, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v12 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_max_i32_e32 v18, -1, v13 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v18, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v13 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_max_i32_e32 v20, -1, v14 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v14 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v20, v31 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_max_i32_e32 v20, -1, v15 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_min_i32_e32 v21, -1, v15 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v18, -1, v15 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v20, v31 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v21, v16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 @@ -2059,25 +2059,25 @@ define <16 x 
i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v32, -1, v0 ; GFX8-NEXT: v_mov_b32_e32 v31, 0x80000001 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 -; GFX8-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX8-NEXT: v_min_i32_e32 v33, -1, v0 +; GFX8-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; GFX8-NEXT: v_min_i32_e32 v32, v32, v33 +; GFX8-NEXT: v_max_i32_e32 v33, -1, v1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v32 -; GFX8-NEXT: v_max_i32_e32 v32, -1, v1 -; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v33, v31 +; GFX8-NEXT: v_min_i32_e32 v33, -1, v1 ; GFX8-NEXT: v_max_i32_e32 v17, v32, v17 -; GFX8-NEXT: v_min_i32_e32 v32, -1, v1 -; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v16 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v33, v16 +; GFX8-NEXT: v_max_i32_e32 v33, -1, v2 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v32 +; GFX8-NEXT: v_min_i32_e32 v32, -1, v2 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v2 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v33, v31 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 -; GFX8-NEXT: v_min_i32_e32 v18, -1, v2 -; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v32 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 @@ -2142,32 +2142,32 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i32_e32 v18, -1, v12 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v18, v31 
; GFX8-NEXT: v_min_i32_e32 v18, -1, v12 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i32_e32 v18, -1, v13 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v18, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v13 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_max_i32_e32 v20, -1, v14 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v14 +; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v20, v31 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_max_i32_e32 v20, -1, v15 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_min_i32_e32 v21, -1, v15 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v18, -1, v15 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v20, v31 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v21, v16 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index d9158e3558395..9c525d2216102 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -1573,12 +1573,12 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-LABEL: v_uaddsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_not_b32_e32 v31, v0 
-; GFX6-NEXT: v_min_u32_e32 v16, v31, v16 ; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX6-NEXT: v_not_b32_e32 v32, v0 +; GFX6-NEXT: v_not_b32_e32 v33, v1 +; GFX6-NEXT: v_min_u32_e32 v16, v32, v16 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v1 -; GFX6-NEXT: v_min_u32_e32 v16, v16, v17 +; GFX6-NEXT: v_min_u32_e32 v16, v33, v17 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v2 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v18 @@ -1607,19 +1607,19 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_not_b32_e32 v16, v10 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v26 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v11 -; GFX6-NEXT: v_min_u32_e32 v16, v16, v27 -; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v12 +; GFX6-NEXT: v_not_b32_e32 v18, v13 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v28 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v13 -; GFX6-NEXT: v_min_u32_e32 v16, v16, v29 +; GFX6-NEXT: v_min_u32_e32 v16, v18, v29 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v14 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v30 +; GFX6-NEXT: v_not_b32_e32 v17, v11 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v15 +; GFX6-NEXT: v_min_u32_e32 v17, v17, v27 +; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_min_u32_e32 v16, v16, v31 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 4de10788a6bd7..93645bbdb8691 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -1924,220 +1924,220 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 
0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v6, v16 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v14 +; GISEL-NEXT: v_mul_hi_u32 v21, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v21 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v18, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 ; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; 
GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v6, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 +; GISEL-NEXT: v_mul_hi_u32 v15, v7, v13 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v22, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v21, v20 +; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 
v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 -; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v17, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; 
GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 ; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: 
v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, 0, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 ; 
GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 
; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v1, v5 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], 1, v7 +; GISEL-NEXT: v_addc_u32_e64 v20, s[4:5], 0, v5, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v21, s[4:5], 1, v19 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_addc_u32_e64 v13, vcc, 0, v20, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 
v2, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v14, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18 -; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11] -; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9] -; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GISEL-NEXT: v_sub_i32_e64 v8, s[8:9], 0, v8 +; GISEL-NEXT: v_sub_i32_e64 v9, s[8:9], 0, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v18, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v15, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v19, v21, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, v20, v13, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index ba5a8e9c68a1f..4625ba2a0d0a1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -993,18 +993,19 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_i32 s12, s12, s17 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x20 ; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20 -; GFX8-NEXT: s_load_dwordx8 
s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 -; GFX8-NEXT: s_sub_u32 s2, 0, s12 -; GFX8-NEXT: s_subb_u32 s3, 0, s13 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s17 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s16 +; GFX8-NEXT: s_sub_u32 s2, 0, s16 +; GFX8-NEXT: s_subb_u32 s3, 0, s17 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 @@ -1036,244 +1037,245 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; 
GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v4, v[0:1] +; GFX8-NEXT: v_mul_hi_u32 v7, v5, v1 +; GFX8-NEXT: s_sub_u32 s2, 0, s18 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v5, v[2:3] +; GFX8-NEXT: v_mul_lo_u32 v3, v4, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mul_lo_u32 v6, v5, v2 +; GFX8-NEXT: s_subb_u32 s3, 0, s19 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_mul_lo_u32 v7, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_mul_hi_u32 v6, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v2, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, 
vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s14 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc -; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX8-NEXT: v_trunc_f32_e32 v3, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v3 -; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v6 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v7, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v3 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: 
v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v12, v[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v3, v15, v1 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v12, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, s13, v1 +; GFX8-NEXT: v_mul_lo_u32 v4, s12, v2 +; GFX8-NEXT: v_mul_hi_u32 v5, s12, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s19 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v2 +; GFX8-NEXT: v_mul_lo_u32 v5, s13, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_mul_hi_u32 v4, s12, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s18 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f800000, v6 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: 
v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v5 +; GFX8-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX8-NEXT: v_trunc_f32_e32 v5, v5 +; GFX8-NEXT: v_mul_f32_e32 v6, 0xcf800000, v5 +; GFX8-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX8-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s16, v1, 0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v17, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v12, v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v19, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v3 -; GFX8-NEXT: v_mul_lo_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, v15, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v9, 0 +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v11, v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; 
GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v9, v[7:8] +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s16, v2, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v11, v3 +; GFX8-NEXT: v_mul_lo_u32 v12, v9, v6 +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s17, v1, v[7:8] +; GFX8-NEXT: v_mul_hi_u32 v8, v9, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, v11, v6 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 +; GFX8-NEXT: v_mul_hi_u32 v12, v9, v6 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v12, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v12 +; GFX8-NEXT: v_mul_hi_u32 v6, v11, v6 +; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v8, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v3 +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, v9 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s12, v5 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v11, v[4:5] +; GFX8-NEXT: v_subb_u32_e64 v3, s[0:1], v10, v7, vcc +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s13, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v5, v11, v8 +; GFX8-NEXT: v_mul_hi_u32 v14, v6, v8 +; GFX8-NEXT: v_mul_lo_u32 v13, v6, v4 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v0, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, v11, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v14, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v13, v5 +; GFX8-NEXT: v_mul_hi_u32 v13, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v14, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v14, v13 +; GFX8-NEXT: v_mul_hi_u32 v4, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, 
v10, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s15 -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3 -; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc -; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v9 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v14 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v7 -; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v14, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v9, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1] -; GFX8-NEXT: flat_store_dwordx4 
v[9:10], v[1:4] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v13, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v4, s15, v5 +; GFX8-NEXT: v_mul_lo_u32 v8, s14, v6 +; GFX8-NEXT: v_mul_hi_u32 v14, s14, v5 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v12 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s16, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX8-NEXT: v_subbrev_u32_e64 v13, s[0:1], 0, v7, vcc +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v14, s15, v6 +; GFX8-NEXT: v_mul_hi_u32 v5, s15, v5 +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v8, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, s14, v6 +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v14, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v15, s[0:1], v5, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s18, v15, 0 +; GFX8-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v13 +; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v3 +; GFX8-NEXT: v_cmp_le_u32_e64 s[4:5], s16, v11 +; GFX8-NEXT: v_mul_hi_u32 v6, s15, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v13 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[2:3] +; GFX8-NEXT: v_add_u32_e64 v19, s[0:1], 1, v17 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v0, vcc +; GFX8-NEXT: 
v_add_u32_e32 v8, vcc, v14, v8 +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v16 +; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v6, v8 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s18, v8, v[5:6] +; GFX8-NEXT: v_mov_b32_e32 v16, s19 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s16, v11 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s19, v15, v[5:6] +; GFX8-NEXT: v_sub_u32_e64 v20, s[0:1], s14, v4 +; GFX8-NEXT: v_mov_b32_e32 v6, s15 +; GFX8-NEXT: v_cmp_le_u32_e64 s[4:5], s18, v20 +; GFX8-NEXT: v_subb_u32_e64 v21, s[2:3], v6, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], s15, v5 +; GFX8-NEXT: v_cmp_le_u32_e64 s[2:3], s19, v21 +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v16, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[2:3] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v18 +; GFX8-NEXT: v_subrev_u32_e64 v18, s[0:1], s18, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v19, s[2:3] +; GFX8-NEXT: v_subbrev_u32_e64 v19, s[4:5], 0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[2:3] +; GFX8-NEXT: v_add_u32_e64 v22, s[4:5], 1, v15 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v17, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v10, s[6:7] +; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v7, vcc +; GFX8-NEXT: v_addc_u32_e64 v7, vcc, 0, v8, s[4:5] +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s19, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v11, v14, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v13, v2, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s19, v21 +; GFX8-NEXT: v_cmp_le_u32_e64 s[2:3], s18, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v6, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s19, v19 +; GFX8-NEXT: v_add_u32_e64 v13, s[2:3], 1, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v7, s[2:3] +; GFX8-NEXT: 
v_subb_u32_e64 v5, vcc, v5, v16, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX8-NEXT: v_subrev_u32_e64 v10, s[0:1], s18, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v9, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v9, v22, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v3, v2, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v15, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v7, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v11, vcc +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v20, v6, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v21, v7, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x20 ; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX9-NEXT: s_sub_u32 s2, 0, s4 -; GFX9-NEXT: s_subb_u32 s3, 0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s21 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s20 +; GFX9-NEXT: s_sub_u32 s2, 0, s20 +; GFX9-NEXT: s_subb_u32 s3, 0, s21 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1304,220 +1306,220 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 ; GFX9-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v4, v[0:1] +; GFX9-NEXT: v_mul_hi_u32 v7, v5, v1 +; GFX9-NEXT: s_sub_u32 s2, 0, s22 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v5, v[2:3] +; GFX9-NEXT: v_mul_lo_u32 v3, v4, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v2 +; GFX9-NEXT: s_subb_u32 s3, 0, s23 +; GFX9-NEXT: v_mov_b32_e32 v0, s21 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 -; 
GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v6, v3, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s17, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s16, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, s16, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s17, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s23 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v4, v2, vcc -; GFX9-NEXT: 
v_cmp_le_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v4, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 
-; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v6, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, s17, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, s16, v3 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f800000, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX9-NEXT: v_mul_hi_u32 v10, v12, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v20, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, 
vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v15, v5 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v5, v8, v7, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s19, v4 -; GFX9-NEXT: v_mul_lo_u32 v8, s18, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s19, v5 -; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v12, s19, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s22 +; GFX9-NEXT: v_mul_hi_u32 v3, s17, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v9, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v0, v10, v8 -; GFX9-NEXT: v_add3_u32 v8, v0, v1, v12 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], 
s[0:1], s6, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s19 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[0:1] -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s18, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v9, v0, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_sub_u32_e32 v0, s19, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v1 +; GFX9-NEXT: v_add3_u32 v3, v4, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v2, 0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v6 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v7, v6 +; GFX9-NEXT: v_mul_f32_e32 v6, 0xcf800000, v7 +; GFX9-NEXT: v_add_f32_e32 v1, v6, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v7 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s20, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v12, 0 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s16, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], s2, v13, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s21, v2, v[7:8] +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v12, v[9:10] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v11, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v8, s17, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v13, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v12, v7 +; GFX9-NEXT: v_mul_hi_u32 v11, v12, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s21, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s6, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v12 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 
v6, s[0:1], v6, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v11, v13, v7 +; GFX9-NEXT: v_mul_hi_u32 v5, v13, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v12, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v7 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v11, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 +; GFX9-NEXT: v_add_u32_e32 v9, v11, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX9-NEXT: v_add3_u32 v6, v9, v6, v7 +; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v12, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], v13, v6, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v9, 0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s20, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[6:7] +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v9, v[6:7] +; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s20, v4 +; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v5 +; GFX9-NEXT: v_mul_lo_u32 v17, v9, v6 +; GFX9-NEXT: v_add_co_u32_e64 v15, s[0:1], 1, v2 +; GFX9-NEXT: v_mul_hi_u32 v19, v9, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s21, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v7, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v7, v19 +; GFX9-NEXT: v_mul_lo_u32 v19, v11, v6 +; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX9-NEXT: v_mul_hi_u32 v20, v9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v6, v11, v6 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v19, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[0:1] +; GFX9-NEXT: 
v_add_co_u32_e64 v5, s[0:1], v5, v20 +; GFX9-NEXT: v_add_u32_e32 v7, v17, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 +; GFX9-NEXT: v_add_u32_e32 v17, v19, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add3_u32 v6, v17, v7, v6 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v9, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v11, v6, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v6, s19, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, s18, v7 +; GFX9-NEXT: v_mul_hi_u32 v17, s18, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s20, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v17, s19, v7 +; GFX9-NEXT: v_mul_hi_u32 v5, s19, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, s18, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s19, v7 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v17, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v19, s[0:1], v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s22, v19, 0 +; GFX9-NEXT: v_add_u32_e32 v9, v17, v9 +; GFX9-NEXT: v_add3_u32 v9, v9, v20, v7 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s22, v9, v[6:7] +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v8, v0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s23, v19, v[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s21, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s21, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_sub_co_u32_e32 v18, vcc, s18, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_subb_co_u32_e64 v20, 
s[2:3], v7, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, s19, v6 +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s23, v20 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v12, vcc +; GFX9-NEXT: v_add_co_u32_e64 v21, s[0:1], 1, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[4:5], s22, v18 +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s22, v18 +; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v16, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v21, s[2:3] +; GFX9-NEXT: v_subbrev_co_u32_e64 v21, s[4:5], 0, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s20, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v17, s[4:5], 1, v19 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v10 +; GFX9-NEXT: v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v9, s[4:5] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s23, v21 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v13, v8, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v14, v0, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s23, v20 +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s22, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s23, v21 +; GFX9-NEXT: v_add_co_u32_e64 v7, s[2:3], 1, v17 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 1, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v6, v12, vcc +; GFX9-NEXT: 
v_addc_co_u32_e64 v14, s[0:1], 0, v10, s[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v10 -; GFX9-NEXT: v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v4, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v10, s[0:1], s22, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v5 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v15, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, v0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v8, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v21, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v18, v0, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v20, v1, s[2:3] ; GFX9-NEXT: global_store_dwordx4 v13, v[2:5], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[6:9], s[14:15] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index a41ec8e7ce3ea..0c57ec0bccb0c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1922,216 +1922,216 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; 
GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v6, v16 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v14 +; GISEL-NEXT: v_mul_hi_u32 v21, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v21 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v18, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 ; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v19 -; GISEL-NEXT: 
v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v6, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 +; GISEL-NEXT: v_mul_hi_u32 v15, v7, v13 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v22, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v21, v20 +; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 -; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v17, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v13 ; GISEL-NEXT: 
v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 ; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 -; GISEL-NEXT: v_cndmask_b32_e64 
v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, 0, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 +; 
GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 
; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 ; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 +; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: 
v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v11, vcc ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc -; GISEL-NEXT: 
v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 1fd139b06417f..f9b2c8fa1bbb4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -1526,36 +1526,36 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_min_u32_e32 v16, v0, v16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GFX6-NEXT: v_min_u32_e32 v16, v1, v17 +; GFX6-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v16 ; GFX6-NEXT: v_min_u32_e32 v16, v2, v18 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 ; GFX6-NEXT: v_min_u32_e32 v16, v3, v19 +; GFX6-NEXT: v_min_u32_e32 v18, v4, v20 +; GFX6-NEXT: v_min_u32_e32 v19, v5, v21 +; GFX6-NEXT: v_min_u32_e32 v20, v6, v22 +; GFX6-NEXT: v_min_u32_e32 v21, v7, v23 +; GFX6-NEXT: v_min_u32_e32 v22, v8, v24 +; GFX6-NEXT: v_min_u32_e32 v23, v9, v25 +; GFX6-NEXT: v_min_u32_e32 v24, v10, v26 +; GFX6-NEXT: v_min_u32_e32 v25, v11, v27 +; GFX6-NEXT: v_min_u32_e32 v26, v12, v28 +; GFX6-NEXT: v_min_u32_e32 v27, v13, v29 +; GFX6-NEXT: v_min_u32_e32 v28, v14, v30 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 -; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_min_u32_e32 v17, v4, v20 -; GFX6-NEXT: v_min_u32_e32 v18, v5, v21 -; GFX6-NEXT: v_min_u32_e32 v19, v6, v22 -; GFX6-NEXT: v_min_u32_e32 v20, v7, v23 -; GFX6-NEXT: v_min_u32_e32 v21, v8, v24 -; GFX6-NEXT: v_min_u32_e32 v22, v9, v25 -; GFX6-NEXT: v_min_u32_e32 v23, v10, v26 -; GFX6-NEXT: v_min_u32_e32 v24, v11, v27 -; 
GFX6-NEXT: v_min_u32_e32 v25, v12, v28 -; GFX6-NEXT: v_min_u32_e32 v26, v13, v29 -; GFX6-NEXT: v_min_u32_e32 v27, v14, v30 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v25 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v18 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v19 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v20 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v21 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v22 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v23 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v24 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v25 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v26 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v27 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v28 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_min_u32_e32 v16, v15, v16 +; GFX6-NEXT: v_min_u32_e32 v16, v15, v17 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll index 7633ba0eb4f9c..7fe14fdfa5b31 100644 --- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll @@ -837,41 +837,41 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_sub_u16_sdwa v15, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v0 +; GFX8-NEXT: v_max_i16_sdwa v15, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: 
v_max_i16_e32 v0, v0, v21 ; GFX8-NEXT: v_sub_u16_sdwa v9, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_sdwa v10, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_sdwa v11, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_sdwa v12, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v14, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v0 -; GFX8-NEXT: v_max_i16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v0, v0, v19 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_sub_u16_e32 v8, 0, v1 -; GFX8-NEXT: v_max_i16_sdwa v15, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v1, v1, v8 +; GFX8-NEXT: v_sub_u16_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v16, 0, v7 ; GFX8-NEXT: v_sub_u16_e32 v17, 0, v6 ; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5 ; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4 -; GFX8-NEXT: v_sub_u16_e32 v8, 0, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 -; GFX8-NEXT: v_sub_u16_e32 v15, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v20, 0, v3 +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 +; GFX8-NEXT: v_sub_u16_e32 v15, 0, v1 +; GFX8-NEXT: v_sub_u16_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_sdwa v14, v1, v14 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v1, v1, v15 ; GFX8-NEXT: v_max_i16_sdwa v9, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v12, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v13, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v14, v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v2, v2, v15 -; GFX8-NEXT: v_max_i16_e32 v3, v3, v8 +; GFX8-NEXT: v_max_i16_sdwa v8, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v2, v2, v21 +; GFX8-NEXT: v_max_i16_e32 v3, v3, v20 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v19 ; GFX8-NEXT: v_max_i16_e32 v5, v5, v18 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v17 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v16 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v12 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v11 @@ -974,44 +974,36 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX6-LABEL: v_abs_v32i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX6-NEXT: v_bfe_i32 v28, v28, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v28 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v28 ; GFX6-NEXT: v_bfe_i32 v29, v29, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v28, v28, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v29 ; GFX6-NEXT: v_bfe_i32 v30, v30, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v29, v29, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 
0, v30 -; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v30, v30, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v26 +; GFX6-NEXT: v_sub_i32_e32 v33, vcc, 0, v29 +; GFX6-NEXT: v_max_i32_e32 v28, v28, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v30 ; GFX6-NEXT: v_bfe_i32 v27, v27, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v26, v26, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v27 -; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v27, v27, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24 +; GFX6-NEXT: v_max_i32_e32 v33, v29, v33 +; GFX6-NEXT: v_max_i32_e32 v29, v30, v32 +; GFX6-NEXT: v_sub_i32_e32 v30, vcc, 0, v27 ; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v24, v24, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25 -; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v25, v25, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v22 +; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v27, v27, v30 +; GFX6-NEXT: v_sub_i32_e32 v30, vcc, 0, v25 ; GFX6-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v22, v22, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v23 -; GFX6-NEXT: v_max_i32_e32 v23, v23, v31 -; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v24 +; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v25, v25, v30 +; GFX6-NEXT: v_sub_i32_e32 v30, vcc, 0, v23 +; GFX6-NEXT: v_max_i32_e32 v24, v24, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v22 +; GFX6-NEXT: v_max_i32_e32 v23, v23, v30 +; GFX6-NEXT: v_max_i32_e32 v22, v22, v32 ; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX6-NEXT: v_or_b32_e32 v22, v22, v23 ; GFX6-NEXT: v_or_b32_e32 v24, v24, v25 ; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX6-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX6-NEXT: v_sub_i32_e32 v29, vcc, 0, v20 -; GFX6-NEXT: v_max_i32_e32 
v20, v20, v29 ; GFX6-NEXT: v_bfe_i32 v18, v18, 0, 16 ; GFX6-NEXT: v_bfe_i32 v19, v19, 0, 16 ; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16 @@ -1032,79 +1024,87 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16 +; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v20 +; GFX6-NEXT: v_sub_i32_e32 v34, vcc, 0, v26 +; GFX6-NEXT: v_max_i32_e32 v20, v20, v32 +; GFX6-NEXT: v_max_i32_e32 v26, v26, v34 +; GFX6-NEXT: v_lshlrev_b32_e32 v30, 16, v33 ; GFX6-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX6-NEXT: v_or_b32_e32 v28, v28, v30 ; GFX6-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX6-NEXT: v_lshrrev_b32_e32 v27, 16, v26 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v23, v31, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v23 ; GFX6-NEXT: v_max_i32_e32 v23, v23, v25 -; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX6-NEXT: v_or_b32_e32 v30, v30, v23 -; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 0, v21 -; GFX6-NEXT: v_max_i32_e32 v21, v21, v23 -; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX6-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v18 -; GFX6-NEXT: v_max_i32_e32 v18, v18, v21 -; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v19 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v21 +; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v21 +; GFX6-NEXT: v_max_i32_e32 v21, v21, v25 +; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v18 +; GFX6-NEXT: v_max_i32_e32 v18, v18, v25 +; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v19 +; GFX6-NEXT: v_max_i32_e32 v19, v19, v25 ; GFX6-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX6-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v16 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v19 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v17 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX6-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX6-NEXT: 
v_sub_i32_e32 v17, vcc, 0, v14 -; GFX6-NEXT: v_max_i32_e32 v14, v14, v17 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v15 -; GFX6-NEXT: v_max_i32_e32 v15, v15, v17 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v14 +; GFX6-NEXT: v_max_i32_e32 v14, v14, v19 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v15 +; GFX6-NEXT: v_max_i32_e32 v15, v15, v19 ; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX6-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12 ; GFX6-NEXT: v_max_i32_e32 v12, v12, v15 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13 ; GFX6-NEXT: v_max_i32_e32 v13, v13, v15 -; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX6-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10 -; GFX6-NEXT: v_max_i32_e32 v10, v10, v13 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 -; GFX6-NEXT: v_max_i32_e32 v11, v11, v13 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 +; GFX6-NEXT: v_max_i32_e32 v10, v10, v15 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v11 +; GFX6-NEXT: v_max_i32_e32 v11, v11, v15 ; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX6-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8 ; GFX6-NEXT: v_max_i32_e32 v8, v8, v11 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9 ; GFX6-NEXT: v_max_i32_e32 v9, v9, v11 -; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 -; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7 -; GFX6-NEXT: v_max_i32_e32 v7, v7, v9 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v6 +; GFX6-NEXT: v_max_i32_e32 v6, v6, v11 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v7 +; GFX6-NEXT: v_max_i32_e32 v7, v7, v11 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v7 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v4, 
v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v2, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 +; GFX6-NEXT: v_max_i32_e32 v2, v2, v7 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 +; GFX6-NEXT: v_max_i32_e32 v3, v3, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; GFX6-NEXT: v_or_b32_e32 v12, v12, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; GFX6-NEXT: v_or_b32_e32 v16, v16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v21 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v20, v20, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_or_b32_e32 v30, v29, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16 @@ -1125,44 +1125,36 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX7-LABEL: v_abs_v32i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: v_bfe_i32 v28, v28, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v28 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v28 ; GFX7-NEXT: v_bfe_i32 v29, v29, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v28, v28, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v29 ; GFX7-NEXT: v_bfe_i32 v30, v30, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v29, v29, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v30 -; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v30, v30, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, 
vcc, 0, v26 +; GFX7-NEXT: v_sub_i32_e32 v33, vcc, 0, v29 +; GFX7-NEXT: v_max_i32_e32 v28, v28, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v30 ; GFX7-NEXT: v_bfe_i32 v27, v27, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v26, v26, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v27 -; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v27, v27, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24 +; GFX7-NEXT: v_max_i32_e32 v33, v29, v33 +; GFX7-NEXT: v_max_i32_e32 v29, v30, v32 +; GFX7-NEXT: v_sub_i32_e32 v30, vcc, 0, v27 ; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v24, v24, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25 -; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v25, v25, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v22 +; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v27, v27, v30 +; GFX7-NEXT: v_sub_i32_e32 v30, vcc, 0, v25 ; GFX7-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v22, v22, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v23 -; GFX7-NEXT: v_max_i32_e32 v23, v23, v31 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v24 +; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v25, v25, v30 +; GFX7-NEXT: v_sub_i32_e32 v30, vcc, 0, v23 +; GFX7-NEXT: v_max_i32_e32 v24, v24, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v22 +; GFX7-NEXT: v_max_i32_e32 v23, v23, v30 +; GFX7-NEXT: v_max_i32_e32 v22, v22, v32 ; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX7-NEXT: v_or_b32_e32 v22, v22, v23 ; GFX7-NEXT: v_or_b32_e32 v24, v24, v25 ; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX7-NEXT: v_sub_i32_e32 v29, vcc, 0, v20 -; GFX7-NEXT: v_max_i32_e32 v20, v20, v29 ; GFX7-NEXT: v_bfe_i32 v18, v18, 0, 16 ; GFX7-NEXT: v_bfe_i32 v19, v19, 0, 16 ; GFX7-NEXT: v_bfe_i32 v16, 
v16, 0, 16 @@ -1183,79 +1175,87 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16 +; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v20 +; GFX7-NEXT: v_sub_i32_e32 v34, vcc, 0, v26 +; GFX7-NEXT: v_max_i32_e32 v20, v20, v32 +; GFX7-NEXT: v_max_i32_e32 v26, v26, v34 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v33 ; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_or_b32_e32 v28, v28, v30 ; GFX7-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v23, v31, 0, 16 ; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v23 ; GFX7-NEXT: v_max_i32_e32 v23, v23, v25 -; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX7-NEXT: v_or_b32_e32 v30, v30, v23 -; GFX7-NEXT: v_sub_i32_e32 v23, vcc, 0, v21 -; GFX7-NEXT: v_max_i32_e32 v21, v21, v23 -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX7-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v18 -; GFX7-NEXT: v_max_i32_e32 v18, v18, v21 -; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v19 -; GFX7-NEXT: v_max_i32_e32 v19, v19, v21 +; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v21 +; GFX7-NEXT: v_max_i32_e32 v21, v21, v25 +; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v18 +; GFX7-NEXT: v_max_i32_e32 v18, v18, v25 +; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v19 +; GFX7-NEXT: v_max_i32_e32 v19, v19, v25 ; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX7-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v16 ; GFX7-NEXT: v_max_i32_e32 v16, v16, v19 ; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v17 ; GFX7-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v14 -; GFX7-NEXT: v_max_i32_e32 v14, v14, v17 -; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v15 -; 
GFX7-NEXT: v_max_i32_e32 v15, v15, v17 +; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v14 +; GFX7-NEXT: v_max_i32_e32 v14, v14, v19 +; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v15 +; GFX7-NEXT: v_max_i32_e32 v15, v15, v19 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12 ; GFX7-NEXT: v_max_i32_e32 v12, v12, v15 ; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13 ; GFX7-NEXT: v_max_i32_e32 v13, v13, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10 -; GFX7-NEXT: v_max_i32_e32 v10, v10, v13 -; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 -; GFX7-NEXT: v_max_i32_e32 v11, v11, v13 +; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 +; GFX7-NEXT: v_max_i32_e32 v10, v10, v15 +; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v11 +; GFX7-NEXT: v_max_i32_e32 v11, v11, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8 ; GFX7-NEXT: v_max_i32_e32 v8, v8, v11 ; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9 ; GFX7-NEXT: v_max_i32_e32 v9, v9, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 -; GFX7-NEXT: v_max_i32_e32 v6, v6, v9 -; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7 -; GFX7-NEXT: v_max_i32_e32 v7, v7, v9 +; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v6 +; GFX7-NEXT: v_max_i32_e32 v6, v6, v11 +; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v7 +; GFX7-NEXT: v_max_i32_e32 v7, v7, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 ; GFX7-NEXT: v_max_i32_e32 v4, v4, v7 ; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 ; GFX7-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v2, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, 
vcc, 0, v3 -; GFX7-NEXT: v_max_i32_e32 v3, v3, v5 +; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 +; GFX7-NEXT: v_max_i32_e32 v2, v2, v7 +; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 +; GFX7-NEXT: v_max_i32_e32 v3, v3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX7-NEXT: v_max_i32_e32 v0, v0, v3 ; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GFX7-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; GFX7-NEXT: v_or_b32_e32 v12, v12, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; GFX7-NEXT: v_or_b32_e32 v16, v16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v21 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v20, v20, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v30, v29, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16 @@ -1277,85 +1277,85 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v16, 0 -; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v0 -; GFX8-NEXT: v_max_i16_sdwa v18, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v0, v0, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v18 -; GFX8-NEXT: v_sub_u16_e32 v18, 0, v1 -; GFX8-NEXT: v_max_i16_sdwa v19, v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v1, v1, v18 -; GFX8-NEXT: 
v_sub_u16_sdwa v18, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v2 -; GFX8-NEXT: v_max_i16_sdwa v18, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v2, v2, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v18 -; GFX8-NEXT: v_sub_u16_e32 v18, 0, v3 -; GFX8-NEXT: v_max_i16_sdwa v19, v3, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v3, v3, v18 -; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4 -; GFX8-NEXT: v_max_i16_sdwa v18, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v4, v4, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 -; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5 -; GFX8-NEXT: v_max_i16_sdwa v19, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v5, v5, v18 -; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v6 -; GFX8-NEXT: v_max_i16_sdwa v18, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v6, v6, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v18 -; GFX8-NEXT: v_sub_u16_e32 v18, 0, v7 -; GFX8-NEXT: v_max_i16_sdwa v19, v7, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v7, 
v7, v18 -; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v8 -; GFX8-NEXT: v_max_i16_sdwa v18, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v8, v8, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v8, v8, v18 -; GFX8-NEXT: v_sub_u16_e32 v18, 0, v9 -; GFX8-NEXT: v_max_i16_sdwa v19, v9, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v9, v9, v18 -; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v10 -; GFX8-NEXT: v_max_i16_sdwa v18, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v10, v10, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX8-NEXT: v_sub_u16_e32 v18, 0, v11 -; GFX8-NEXT: v_max_i16_sdwa v19, v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v11, v11, v18 -; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v12 -; GFX8-NEXT: v_max_i16_sdwa v18, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v12, v12, v19 +; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v0 +; GFX8-NEXT: v_max_i16_sdwa v20, v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 
v0, v0, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v1 +; GFX8-NEXT: v_max_i16_e32 v1, v1, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v2, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v2 +; GFX8-NEXT: v_max_i16_e32 v2, v2, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v3 +; GFX8-NEXT: v_max_i16_e32 v3, v3, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v4 +; GFX8-NEXT: v_max_i16_e32 v4, v4, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v5, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v5 +; GFX8-NEXT: v_max_i16_e32 v5, v5, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v6, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v6 +; 
GFX8-NEXT: v_max_i16_e32 v6, v6, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v7, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v7 +; GFX8-NEXT: v_max_i16_e32 v7, v7, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v8, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v8 +; GFX8-NEXT: v_max_i16_e32 v8, v8, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v8, v8, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v9, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v9 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v10, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v10 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v21 +; GFX8-NEXT: v_sub_u16_sdwa v21, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v10, v10, v20 +; GFX8-NEXT: v_max_i16_sdwa v20, v11, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v11 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v21 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v11, v11, v20 +; GFX8-NEXT: v_sub_u16_e32 v20, 0, v12 +; GFX8-NEXT: v_max_i16_sdwa v19, v12, v19 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v12, v12, v20 ; GFX8-NEXT: v_sub_u16_sdwa v17, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v16, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v12, v12, v18 -; GFX8-NEXT: v_sub_u16_e32 v18, 0, v13 -; GFX8-NEXT: v_max_i16_sdwa v16, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v13, v13, v18 -; GFX8-NEXT: v_sub_u16_e32 v18, 0, v15 -; GFX8-NEXT: v_or_b32_e32 v13, v13, v16 -; GFX8-NEXT: v_sub_u16_e32 v16, 0, v14 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v21, 0, v15 +; GFX8-NEXT: v_sub_u16_e32 v20, 0, v14 +; GFX8-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v13 +; GFX8-NEXT: v_sub_u16_sdwa v16, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_sdwa v18, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v13, v13, v19 ; GFX8-NEXT: v_max_i16_sdwa v17, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v19, v14, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v14, v14, v16 -; GFX8-NEXT: v_max_i16_e32 v15, v15, v18 -; GFX8-NEXT: v_or_b32_e32 v14, v14, v19 +; GFX8-NEXT: v_max_i16_sdwa v16, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v14, v14, v20 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v21 +; GFX8-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX8-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX8-NEXT: v_or_b32_e32 v15, v15, v17 ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index a6a0a9a3c9015..ca5776c3691fa 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -485,13 +485,13 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) { ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; HSA-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; HSA-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] ; HSA-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; HSA-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc -; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; HSA-NEXT: v_cndmask_b32_e32 v3, -1, v6, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; HSA-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] @@ -732,130 +732,134 @@ define <16 x ptr> @addrspacecast_v16p5_to_v16p0(<16 x ptr addrspace(5)> %ptr) { ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[6:7], 0x11 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 -; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 +; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v2 +; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v3 +; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v31, s4 -; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; CI-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc -; CI-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; CI-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc -; CI-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; CI-NEXT: 
v_cmp_ne_u32_e64 s[4:5], -1, v4 -; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 -; CI-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc -; CI-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] -; CI-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] -; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 -; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 -; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 -; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 -; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 -; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 -; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 -; CI-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 -; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] -; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] -; CI-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc -; CI-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] -; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] -; CI-NEXT: v_mov_b32_e32 v1, v49 -; CI-NEXT: v_mov_b32_e32 v2, v34 -; CI-NEXT: v_mov_b32_e32 v3, v39 -; CI-NEXT: v_mov_b32_e32 v4, v35 -; CI-NEXT: v_mov_b32_e32 v5, v32 -; CI-NEXT: v_mov_b32_e32 v6, v36 -; CI-NEXT: 
v_mov_b32_e32 v8, v48 -; CI-NEXT: v_mov_b32_e32 v10, v37 +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v1 +; CI-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e64 v36, 0, v31, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v50, 0, v31, s[6:7] +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; CI-NEXT: v_cndmask_b32_e64 v38, 0, v1, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v51, 0, v2, s[6:7] +; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v5 +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v6 +; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v7 +; CI-NEXT: v_cndmask_b32_e64 v37, 0, v31, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v39, 0, v3, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v32, 0, v4, s[10:11] +; CI-NEXT: v_cndmask_b32_e32 v48, 0, v5, vcc +; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v49, 0, v7, s[6:7] +; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v8 +; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v9 +; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v10 +; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v11 +; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v12 +; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v13 +; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v14 +; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v15 +; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[22:23] +; CI-NEXT: v_cndmask_b32_e32 v11, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, 
s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[22:23] +; CI-NEXT: v_mov_b32_e32 v1, v34 +; CI-NEXT: v_mov_b32_e32 v2, v38 +; CI-NEXT: v_mov_b32_e32 v3, v36 +; CI-NEXT: v_mov_b32_e32 v4, v51 +; CI-NEXT: v_mov_b32_e32 v5, v50 +; CI-NEXT: v_mov_b32_e32 v6, v39 +; CI-NEXT: v_mov_b32_e32 v7, v37 +; CI-NEXT: v_mov_b32_e32 v8, v32 +; CI-NEXT: v_mov_b32_e32 v9, v35 +; CI-NEXT: v_mov_b32_e32 v10, v48 ; CI-NEXT: v_mov_b32_e32 v12, v33 -; CI-NEXT: v_mov_b32_e32 v14, v38 +; CI-NEXT: v_mov_b32_e32 v14, v49 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: addrspacecast_v16p5_to_v16p0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v31, s5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v36, 0, v31, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v31, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v3 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc -; GFX9-NEXT: 
v_cndmask_b32_e64 v48, 0, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] -; GFX9-NEXT: v_mov_b32_e32 v1, v49 -; GFX9-NEXT: v_mov_b32_e32 v2, v34 -; GFX9-NEXT: v_mov_b32_e32 v3, v39 -; GFX9-NEXT: v_mov_b32_e32 v4, v35 -; GFX9-NEXT: v_mov_b32_e32 v5, v32 -; GFX9-NEXT: v_mov_b32_e32 v6, v36 -; GFX9-NEXT: v_mov_b32_e32 v8, v48 -; GFX9-NEXT: v_mov_b32_e32 v10, v37 +; 
GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v51, 0, v2, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v5 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v6 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v31, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v39, 0, v3, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v32, 0, v4, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e32 v48, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v49, 0, v7, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v8 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v9 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v10 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v11 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v12 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v13 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v14 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[22:23] +; 
GFX9-NEXT: v_mov_b32_e32 v1, v34 +; GFX9-NEXT: v_mov_b32_e32 v2, v38 +; GFX9-NEXT: v_mov_b32_e32 v3, v36 +; GFX9-NEXT: v_mov_b32_e32 v4, v51 +; GFX9-NEXT: v_mov_b32_e32 v5, v50 +; GFX9-NEXT: v_mov_b32_e32 v6, v39 +; GFX9-NEXT: v_mov_b32_e32 v7, v37 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v35 +; GFX9-NEXT: v_mov_b32_e32 v10, v48 ; GFX9-NEXT: v_mov_b32_e32 v12, v33 -; GFX9-NEXT: v_mov_b32_e32 v14, v38 +; GFX9-NEXT: v_mov_b32_e32 v14, v49 ; GFX9-NEXT: s_setpc_b64 s[30:31] %cast = addrspacecast <16 x ptr addrspace(5)> %ptr to <16 x ptr> ret <16 x ptr> %cast @@ -937,13 +941,13 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) { ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; HSA-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; HSA-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] ; HSA-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; HSA-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc -; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; HSA-NEXT: v_cndmask_b32_e32 v3, -1, v6, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; HSA-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] @@ -1184,130 +1188,134 @@ define <16 x ptr> @addrspacecast_v16p3_to_v16p0(<16 x ptr addrspace(3)> %ptr) { ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[6:7], 0x10 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 -; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 +; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v2 +; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v3 +; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v31, s4 -; CI-NEXT: v_cndmask_b32_e32 v49, 0, 
v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; CI-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc -; CI-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; CI-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc -; CI-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 -; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 -; CI-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc -; CI-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] -; CI-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] -; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 -; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 -; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 -; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 -; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 -; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 -; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 -; CI-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 -; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] -; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] -; CI-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc -; CI-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] 
-; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] -; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] -; CI-NEXT: v_mov_b32_e32 v1, v49 -; CI-NEXT: v_mov_b32_e32 v2, v34 -; CI-NEXT: v_mov_b32_e32 v3, v39 -; CI-NEXT: v_mov_b32_e32 v4, v35 -; CI-NEXT: v_mov_b32_e32 v5, v32 -; CI-NEXT: v_mov_b32_e32 v6, v36 -; CI-NEXT: v_mov_b32_e32 v8, v48 -; CI-NEXT: v_mov_b32_e32 v10, v37 +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v1 +; CI-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e64 v36, 0, v31, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v50, 0, v31, s[6:7] +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; CI-NEXT: v_cndmask_b32_e64 v38, 0, v1, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v51, 0, v2, s[6:7] +; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v5 +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v6 +; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v7 +; CI-NEXT: v_cndmask_b32_e64 v37, 0, v31, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v39, 0, v3, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v32, 0, v4, s[10:11] +; CI-NEXT: v_cndmask_b32_e32 v48, 0, v5, vcc +; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v49, 0, v7, s[6:7] +; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v8 +; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v9 +; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v10 +; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v11 +; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v12 +; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v13 +; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v14 +; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v15 +; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[22:23] +; CI-NEXT: v_cndmask_b32_e32 
v11, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[22:23] +; CI-NEXT: v_mov_b32_e32 v1, v34 +; CI-NEXT: v_mov_b32_e32 v2, v38 +; CI-NEXT: v_mov_b32_e32 v3, v36 +; CI-NEXT: v_mov_b32_e32 v4, v51 +; CI-NEXT: v_mov_b32_e32 v5, v50 +; CI-NEXT: v_mov_b32_e32 v6, v39 +; CI-NEXT: v_mov_b32_e32 v7, v37 +; CI-NEXT: v_mov_b32_e32 v8, v32 +; CI-NEXT: v_mov_b32_e32 v9, v35 +; CI-NEXT: v_mov_b32_e32 v10, v48 ; CI-NEXT: v_mov_b32_e32 v12, v33 -; CI-NEXT: v_mov_b32_e32 v14, v38 +; CI-NEXT: v_mov_b32_e32 v14, v49 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: addrspacecast_v16p3_to_v16p0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v31, s5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v36, 0, v31, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v31, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v3 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v32, 0, 
v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, 
s[26:27] -; GFX9-NEXT: v_mov_b32_e32 v1, v49 -; GFX9-NEXT: v_mov_b32_e32 v2, v34 -; GFX9-NEXT: v_mov_b32_e32 v3, v39 -; GFX9-NEXT: v_mov_b32_e32 v4, v35 -; GFX9-NEXT: v_mov_b32_e32 v5, v32 -; GFX9-NEXT: v_mov_b32_e32 v6, v36 -; GFX9-NEXT: v_mov_b32_e32 v8, v48 -; GFX9-NEXT: v_mov_b32_e32 v10, v37 +; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v51, 0, v2, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v5 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v6 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v31, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v39, 0, v3, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v32, 0, v4, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e32 v48, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v49, 0, v7, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v8 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v9 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v10 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v11 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v12 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v13 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v14 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[10:11] +; GFX9-NEXT: 
v_cndmask_b32_e64 v21, 0, v31, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[22:23] +; GFX9-NEXT: v_mov_b32_e32 v1, v34 +; GFX9-NEXT: v_mov_b32_e32 v2, v38 +; GFX9-NEXT: v_mov_b32_e32 v3, v36 +; GFX9-NEXT: v_mov_b32_e32 v4, v51 +; GFX9-NEXT: v_mov_b32_e32 v5, v50 +; GFX9-NEXT: v_mov_b32_e32 v6, v39 +; GFX9-NEXT: v_mov_b32_e32 v7, v37 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v35 +; GFX9-NEXT: v_mov_b32_e32 v10, v48 ; GFX9-NEXT: v_mov_b32_e32 v12, v33 -; GFX9-NEXT: v_mov_b32_e32 v14, v38 +; GFX9-NEXT: v_mov_b32_e32 v14, v49 ; GFX9-NEXT: s_setpc_b64 s[30:31] %cast = addrspacecast <16 x ptr addrspace(3)> %ptr to <16 x ptr> ret <16 x ptr> %cast @@ -1549,6 +1557,7 @@ define <16 x ptr> @addrspacecast_v16p6_to_v16p0(<16 x ptr addrspace(6)> %ptr) { ; HSA-NEXT: v_mov_b32_e32 v28, v14 ; HSA-NEXT: v_mov_b32_e32 v24, v12 ; HSA-NEXT: v_mov_b32_e32 v20, v10 +; HSA-NEXT: v_mov_b32_e32 v18, v9 ; HSA-NEXT: v_mov_b32_e32 v16, v8 ; HSA-NEXT: v_mov_b32_e32 v14, v7 ; HSA-NEXT: v_mov_b32_e32 v12, v6 @@ -1561,11 +1570,10 @@ define <16 x ptr> @addrspacecast_v16p6_to_v16p0(<16 x ptr addrspace(6)> %ptr) { ; HSA-NEXT: v_mov_b32_e32 v3, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0 ; HSA-NEXT: v_mov_b32_e32 v7, 0 -; HSA-NEXT: v_mov_b32_e32 v18, v9 +; HSA-NEXT: v_mov_b32_e32 v9, 0 ; HSA-NEXT: v_mov_b32_e32 v22, v11 ; HSA-NEXT: v_mov_b32_e32 v26, v13 ; HSA-NEXT: v_mov_b32_e32 v30, v15 -; HSA-NEXT: v_mov_b32_e32 v9, 0 ; HSA-NEXT: v_mov_b32_e32 v11, 0 ; HSA-NEXT: v_mov_b32_e32 v13, 0 ; HSA-NEXT: v_mov_b32_e32 v15, 0 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll index 63b7b70548baf..2e8a56566a8ed 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll @@ -180,34 +180,34 @@ define 
amdgpu_kernel void @test_call_empty() #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: 
v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 @@ -216,19 +216,19 @@ define amdgpu_kernel void @test_call_empty() #0 { ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[24:27], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[20:23], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[16:19], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -315,34 +315,34 
@@ define amdgpu_kernel void @test_call_areg4() #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; 
GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 @@ -351,19 +351,19 @@ define amdgpu_kernel void @test_call_areg4() #0 { ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[24:27], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[20:23], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[16:19], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ 
-450,34 +450,34 @@ define amdgpu_kernel void @test_call_areg32() #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: v_accvgpr_read_b32 
v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 @@ -486,19 +486,19 @@ define amdgpu_kernel void @test_call_areg32() #0 { ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[24:27], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[20:23], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[16:19], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm 
bb: @@ -585,34 +585,34 @@ define amdgpu_kernel void @test_call_areg64() #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX908-NEXT: 
v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 @@ -621,19 +621,19 @@ define amdgpu_kernel void @test_call_areg64() #0 { ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[24:27], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[20:23], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[16:19], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: s_endpgm bb: @@ -720,34 +720,34 @@ define amdgpu_kernel void @test_call_areg31_63() #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 +; 
GFX908-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v34, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v33, a29 @@ -756,19 +756,19 @@ define amdgpu_kernel void @test_call_areg31_63() #0 { ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[24:27], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[20:23], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[16:19], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX908-NEXT: 
s_waitcnt vmcnt(0) ; GFX908-NEXT: s_endpgm bb: @@ -856,26 +856,26 @@ define amdgpu_kernel void @test_call_unknown() #0 { ; GFX908-NEXT: ; def a[0:31] ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_accvgpr_read_b32 v43, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v47, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v59, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v63, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v75, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v79, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v42, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v41, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v40, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v47, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v46, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v45, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v44, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v59, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v58, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v57, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v56, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v63, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v62, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v61, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v60, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v75, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v74, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v73, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v72, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v79, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v78, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v77, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v76, a20 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index d03d6a8940b2f..c8a280dda40cc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -252,8 +252,8 @@ define inreg <32 x float> @bitcast_v32i32_to_v32f32_scalar(<32 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], 
vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -331,8 +331,8 @@ define inreg <32 x float> @bitcast_v32i32_to_v32f32_scalar(<32 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -410,8 +410,8 @@ define inreg <32 x float> @bitcast_v32i32_to_v32f32_scalar(<32 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -775,8 +775,8 @@ define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -854,8 +854,8 @@ define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -933,8 +933,8 @@ define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 
s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -1298,8 +1298,8 @@ define inreg <16 x i64> @bitcast_v32i32_to_v16i64_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -1377,8 +1377,8 @@ define inreg <16 x i64> @bitcast_v32i32_to_v16i64_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -1456,8 +1456,8 @@ define inreg <16 x i64> @bitcast_v32i32_to_v16i64_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -1845,8 +1845,8 @@ define inreg <32 x i32> @bitcast_v16i64_to_v32i32_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -1924,8 +1924,8 @@ define inreg <32 x i32> @bitcast_v16i64_to_v32i32_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: 
v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -2003,8 +2003,8 @@ define inreg <32 x i32> @bitcast_v16i64_to_v32i32_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -2392,8 +2392,8 @@ define inreg <16 x double> @bitcast_v32i32_to_v16f64_scalar(<32 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -2471,8 +2471,8 @@ define inreg <16 x double> @bitcast_v32i32_to_v16f64_scalar(<32 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -2550,8 +2550,8 @@ define inreg <16 x double> @bitcast_v32i32_to_v16f64_scalar(<32 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ 
-2848,26 +2848,26 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_mov_b32_e32 v31, v17 -; SI-NEXT: v_mov_b32_e32 v30, v16 ; SI-NEXT: v_mov_b32_e32 v29, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v30, v16 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v26, v12 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v33, v5 -; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -2912,26 +2912,26 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 -; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; 
VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -2976,26 +2976,26 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_mov_b32_e32 v31, v17 -; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: 
v_mov_b32_e32 v5, s21 @@ -6821,32 +6821,30 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, 
v44, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 @@ -6941,165 +6939,30 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: 
s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v25 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v112, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v10 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v181, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v15, 3, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; 
GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v32 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 @@ -7179,48 +7042,184 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, 
v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v32 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v17 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v147, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v67, v65 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v39, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v65, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v56 @@ -7482,27 +7481,26 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 
v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, 
s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7577,21 +7575,24 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s71, 23 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: v_mov_b32_e32 v19, s43 +; SI-NEXT: v_alignbit_b32 v21, s42, v19, 24 +; SI-NEXT: v_alignbit_b32 v23, s42, v19, 16 +; SI-NEXT: v_alignbit_b32 v24, s42, v19, 8 +; SI-NEXT: v_mov_b32_e32 v19, s45 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_mov_b32_e32 v12, s13 -; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v14, s15 ; SI-NEXT: v_mov_b32_e32 v18, s41 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 -; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 -; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 -; SI-NEXT: v_mov_b32_e32 v22, s47 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v29, s26 -; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_alignbit_b32 v28, s44, v19, 24 +; SI-NEXT: v_alignbit_b32 v29, s44, v19, 16 +; SI-NEXT: v_alignbit_b32 v30, s44, v19, 8 +; SI-NEXT: v_mov_b32_e32 v19, s47 +; SI-NEXT: v_mov_b32_e32 v22, s28 +; SI-NEXT: v_mov_b32_e32 v27, s26 +; SI-NEXT: v_mov_b32_e32 v33, s24 ; SI-NEXT: v_mov_b32_e32 v39, s22 ; SI-NEXT: v_mov_b32_e32 v50, s20 ; SI-NEXT: v_mov_b32_e32 v53, s18 @@ -7603,33 +7604,30 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 ; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 ; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 -; SI-NEXT: 
v_alignbit_b32 v7, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v9, s10, v7, 24 +; SI-NEXT: v_alignbit_b32 v10, s10, v7, 16 +; SI-NEXT: v_alignbit_b32 v11, s10, v7, 8 +; SI-NEXT: v_alignbit_b32 v15, s12, v8, 24 +; SI-NEXT: v_alignbit_b32 v7, s12, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, s12, v8, 8 +; SI-NEXT: v_alignbit_b32 v12, s14, v14, 24 +; SI-NEXT: v_alignbit_b32 v13, s14, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s14, v14, 8 ; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 ; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 ; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 -; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 -; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 -; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 -; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 -; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 -; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 -; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 -; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 -; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 -; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v34, s46, v19, 24 +; SI-NEXT: v_alignbit_b32 v35, s46, v19, 16 +; SI-NEXT: v_alignbit_b32 v36, s46, v19, 8 +; SI-NEXT: v_alignbit_b32 v19, s29, v22, 24 +; SI-NEXT: v_alignbit_b32 v20, s29, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s29, v22, 8 +; SI-NEXT: v_alignbit_b32 v25, s27, v27, 24 +; SI-NEXT: v_alignbit_b32 v26, s27, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, s27, v27, 8 +; SI-NEXT: 
v_alignbit_b32 v31, s25, v33, 24 +; SI-NEXT: v_alignbit_b32 v32, s25, v33, 16 +; SI-NEXT: v_alignbit_b32 v33, s25, v33, 8 ; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 ; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 ; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 @@ -7692,7 +7690,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s71, s17, 8 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: v_mov_b32_e32 v19, s43 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -7702,14 +7703,16 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s47, s47, 3 ; SI-NEXT: s_add_i32 s44, s44, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: v_alignbit_b32 v21, s42, v19, 24 +; SI-NEXT: v_alignbit_b32 v23, s42, v19, 16 +; SI-NEXT: v_alignbit_b32 v24, s42, v19, 8 +; SI-NEXT: v_mov_b32_e32 v19, s45 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -7718,7 +7721,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s46, s46, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 @@ -7727,18 +7729,17 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v9, s11 
-; SI-NEXT: v_mov_b32_e32 v12, s13 -; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v14, s15 ; SI-NEXT: v_mov_b32_e32 v18, s41 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 -; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 -; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 -; SI-NEXT: v_mov_b32_e32 v22, s47 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v29, s26 -; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_alignbit_b32 v28, s44, v19, 24 +; SI-NEXT: v_alignbit_b32 v29, s44, v19, 16 +; SI-NEXT: v_alignbit_b32 v30, s44, v19, 8 +; SI-NEXT: v_mov_b32_e32 v19, s47 +; SI-NEXT: v_mov_b32_e32 v22, s28 +; SI-NEXT: v_mov_b32_e32 v27, s26 +; SI-NEXT: v_mov_b32_e32 v33, s24 ; SI-NEXT: v_mov_b32_e32 v39, s22 ; SI-NEXT: v_mov_b32_e32 v50, s20 ; SI-NEXT: v_mov_b32_e32 v53, s18 @@ -7750,33 +7751,30 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 ; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 ; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v9, s10, v7, 24 +; SI-NEXT: v_alignbit_b32 v10, s10, v7, 16 +; SI-NEXT: v_alignbit_b32 v11, s10, v7, 8 +; SI-NEXT: v_alignbit_b32 v15, s12, v8, 24 +; SI-NEXT: v_alignbit_b32 v7, s12, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, s12, v8, 8 +; SI-NEXT: v_alignbit_b32 v12, s14, v14, 24 +; SI-NEXT: v_alignbit_b32 v13, s14, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s14, v14, 8 ; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 ; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 ; 
SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 -; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 -; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 -; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 -; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 -; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 -; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 -; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 -; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 -; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 -; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v34, s46, v19, 24 +; SI-NEXT: v_alignbit_b32 v35, s46, v19, 16 +; SI-NEXT: v_alignbit_b32 v36, s46, v19, 8 +; SI-NEXT: v_alignbit_b32 v19, s29, v22, 24 +; SI-NEXT: v_alignbit_b32 v20, s29, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s29, v22, 8 +; SI-NEXT: v_alignbit_b32 v25, s27, v27, 24 +; SI-NEXT: v_alignbit_b32 v26, s27, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, s27, v27, 8 +; SI-NEXT: v_alignbit_b32 v31, s25, v33, 24 +; SI-NEXT: v_alignbit_b32 v32, s25, v33, 16 +; SI-NEXT: v_alignbit_b32 v33, s25, v33, 8 ; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 ; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 ; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 @@ -7932,149 +7930,150 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v38, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; SI-NEXT: v_or_b32_e32 v35, s4, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; SI-NEXT: v_or_b32_e32 v33, s4, v33 ; SI-NEXT: s_and_b32 s4, s25, 0xff ; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: 
v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s49, 24 -; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v33, v35, v33 -; SI-NEXT: v_add_i32_e32 v34, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_add_i32_e32 v32, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v32, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; SI-NEXT: v_or_b32_e32 v29, s4, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_or_b32_e32 v27, s4, v27 ; SI-NEXT: s_and_b32 s4, s27, 0xff ; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s39, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s38, 24 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 
40, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 40, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v26, s4 ; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; SI-NEXT: v_or_b32_e32 v23, s4, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 ; SI-NEXT: s_and_b32 s4, s29, 0xff ; SI-NEXT: s_lshl_b32 s5, s37, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s36, 0xff -; SI-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s35, 24 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v22, v27, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 48, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; 
SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v32 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; SI-NEXT: v_or_b32_e32 v19, s4, v19 ; SI-NEXT: s_and_b32 s4, s46, 0xff ; SI-NEXT: s_lshl_b32 s5, s34, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v35 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s31, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v30 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v34 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s30, 24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v23, v27, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v26 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v30 +; SI-NEXT: v_or_b32_e32 v19, s4, v19 ; SI-NEXT: s_and_b32 s4, s44, 0xff ; SI-NEXT: s_lshl_b32 s5, 
s95, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v29 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s94, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v28 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s93, 24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 64, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v24 +; SI-NEXT: v_or_b32_e32 v19, s4, v19 ; SI-NEXT: s_and_b32 s4, s42, 0xff ; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v23 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s91, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s90, 24 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: s_and_b32 s4, 
s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v20, s4 ; SI-NEXT: s_and_b32 s4, s41, 0xff @@ -8102,75 +8101,76 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v17, s4 ; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 ; SI-NEXT: s_and_b32 s4, s14, 0xff ; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s14, s76, 24 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s14, s5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x58, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: buffer_store_dword v12, v13, 
s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 ; SI-NEXT: s_and_b32 s4, s12, 0xff ; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x5c, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s74, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v15 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s12, s73, 24 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s12, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x60, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v11 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 ; SI-NEXT: s_and_b32 s4, s10, 0xff ; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s63, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s10, s62, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s10, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s9, 0xff @@ -8258,41 +8258,59 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; 
implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr35 @@ -8317,34 +8335,16 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; 
implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -8924,17 +8924,17 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 ; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 50 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 50 ; VI-NEXT: v_mov_b32_e32 v15, s16 ; VI-NEXT: s_and_b32 s16, s45, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_lshl_b32 s17, s17, 8 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: v_readlane_b32 s17, v21, 49 ; VI-NEXT: v_readlane_b32 s18, v21, 48 ; VI-NEXT: buffer_store_dword 
v6, v1, s[0:3], 0 offen @@ -9469,108 +9469,107 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: v_writelane_b32 v21, s46, 0 ; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 1 ; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 ; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 ; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 ; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 ; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 ; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 ; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 ; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 ; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 ; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 ; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 ; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 ; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: v_writelane_b32 
v21, s46, 14 ; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 ; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 ; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 ; GFX9-NEXT: s_lshr_b32 s46, s10, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 ; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 ; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 ; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 ; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 ; GFX9-NEXT: s_lshr_b32 s46, s12, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 ; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 ; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 ; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 ; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 ; GFX9-NEXT: s_lshr_b32 s46, s14, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 ; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 ; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 ; GFX9-NEXT: s_lshr_b32 
s46, s41, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 ; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 ; GFX9-NEXT: s_lshr_b32 s46, s40, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 ; GFX9-NEXT: s_lshr_b32 s46, s40, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 ; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 ; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 ; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 ; GFX9-NEXT: s_lshr_b32 s46, s42, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 ; GFX9-NEXT: s_lshr_b32 s46, s42, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 ; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 ; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 ; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 ; GFX9-NEXT: s_lshr_b32 s46, s44, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 ; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 ; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 ; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 ; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: 
v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 ; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 50 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v21, s56, 0 -; GFX9-NEXT: s_lshr_b32 s82, s28, 8 -; GFX9-NEXT: s_lshr_b32 s83, s27, 24 -; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s82, s27, 24 +; GFX9-NEXT: s_lshr_b32 s83, s27, 16 ; GFX9-NEXT: s_lshr_b32 s84, s27, 8 ; GFX9-NEXT: s_lshr_b32 s85, s26, 16 ; GFX9-NEXT: s_lshr_b32 s86, s26, 8 @@ -9598,8 +9597,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s70, s17, 16 ; GFX9-NEXT: s_lshr_b32 s71, s17, 8 ; GFX9-NEXT: s_lshr_b32 s80, s16, 16 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b32 s81, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 ; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 @@ -9619,123 +9618,121 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s5, s5, 3 ; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: v_writelane_b32 v21, s46, 0 ; GFX9-NEXT: s_lshr_b32 s46, s5, 16 ; GFX9-NEXT: s_add_i32 s4, s4, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 1 ; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 ; GFX9-NEXT: s_lshr_b32 s46, s4, 16 ; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 ; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: v_writelane_b32 
v21, s46, 4 ; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 ; GFX9-NEXT: s_lshr_b32 s46, s7, 16 ; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 ; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 ; GFX9-NEXT: s_lshr_b32 s46, s6, 16 ; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 ; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 ; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 ; GFX9-NEXT: s_lshr_b32 s46, s9, 16 ; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 ; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 ; GFX9-NEXT: s_lshr_b32 s46, s8, 16 ; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 ; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 ; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 ; GFX9-NEXT: s_lshr_b32 s46, s11, 16 ; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 ; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 ; GFX9-NEXT: s_lshr_b32 s46, s10, 16 ; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 ; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; 
GFX9-NEXT: v_writelane_b32 v21, s46, 19 ; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 ; GFX9-NEXT: s_lshr_b32 s46, s13, 16 ; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 ; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 ; GFX9-NEXT: s_lshr_b32 s46, s12, 16 ; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 ; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 ; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 ; GFX9-NEXT: s_lshr_b32 s46, s15, 16 ; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 ; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 ; GFX9-NEXT: s_lshr_b32 s46, s14, 16 ; GFX9-NEXT: s_add_i32 s41, s41, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 ; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 ; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_add_i32 s40, s40, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 ; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 ; GFX9-NEXT: s_lshr_b32 s46, s40, 16 ; GFX9-NEXT: s_add_i32 s43, s43, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 ; GFX9-NEXT: s_lshr_b32 s46, 
s40, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 ; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 ; GFX9-NEXT: s_lshr_b32 s46, s43, 16 ; GFX9-NEXT: s_add_i32 s42, s42, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 ; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 ; GFX9-NEXT: s_lshr_b32 s46, s42, 16 ; GFX9-NEXT: s_add_i32 s45, s45, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 ; GFX9-NEXT: s_lshr_b32 s46, s42, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 ; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 ; GFX9-NEXT: s_lshr_b32 s46, s45, 16 ; GFX9-NEXT: s_add_i32 s44, s44, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 ; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 ; GFX9-NEXT: s_lshr_b32 s46, s44, 16 ; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 ; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 ; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 ; GFX9-NEXT: s_lshr_b32 s46, s29, 16 ; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 ; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 ; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 50 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 ; 
GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s19, s19, 3 @@ -9748,10 +9745,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 ; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: v_writelane_b32 v21, s56, 0 -; GFX9-NEXT: s_lshr_b32 s82, s28, 8 -; GFX9-NEXT: s_lshr_b32 s83, s27, 24 -; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s82, s27, 24 +; GFX9-NEXT: s_lshr_b32 s83, s27, 16 ; GFX9-NEXT: s_lshr_b32 s84, s27, 8 ; GFX9-NEXT: s_lshr_b32 s85, s26, 16 ; GFX9-NEXT: s_lshr_b32 s86, s26, 8 @@ -9779,8 +9777,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s70, s17, 16 ; GFX9-NEXT: s_lshr_b32 s71, s17, 8 ; GFX9-NEXT: s_lshr_b32 s80, s16, 16 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b32 s81, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 ; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 @@ -9797,22 +9795,22 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 ; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_lshl_b32 s46, s46, 8 +; GFX9-NEXT: s_lshl_b32 s47, s81, 8 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff -; GFX9-NEXT: s_or_b32 s16, s16, s46 -; GFX9-NEXT: s_lshl_b32 s46, s36, 8 -; GFX9-NEXT: s_and_b32 s47, s80, 0xff -; GFX9-NEXT: s_or_b32 s46, s47, s46 +; GFX9-NEXT: s_or_b32 s16, s16, s47 +; GFX9-NEXT: s_lshl_b32 s47, s36, 8 +; GFX9-NEXT: s_and_b32 s57, s80, 0xff +; GFX9-NEXT: s_or_b32 s47, s57, s47 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s46, 
s46, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: s_lshl_b32 s47, s47, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s47 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s71, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: s_and_b32 s17, s70, 0xff -; GFX9-NEXT: s_lshl_b32 s46, s69, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s46 +; GFX9-NEXT: s_lshl_b32 s47, s69, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s47 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 @@ -9910,16 +9908,17 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s16, s27, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s84, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s81, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s83, 8 +; GFX9-NEXT: s_and_b32 s17, s83, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s82, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s16 -; GFX9-NEXT: s_lshl_b32 s16, s82, 8 +; GFX9-NEXT: v_readlane_b32 s16, v21, 49 +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_and_b32 s17, s28, 0xff -; GFX9-NEXT: v_readlane_b32 s18, v21, 50 +; GFX9-NEXT: v_readlane_b32 s18, v21, 48 ; GFX9-NEXT: s_or_b32 s16, s17, s16 ; GFX9-NEXT: s_lshl_b32 s17, s88, 8 ; GFX9-NEXT: s_and_b32 s18, s18, 0xff @@ -9927,20 +9926,20 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 49 +; GFX9-NEXT: v_readlane_b32 s17, v21, 47 ; GFX9-NEXT: v_mov_b32_e32 v13, s16 ; GFX9-NEXT: s_and_b32 s16, s29, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 48 -; GFX9-NEXT: v_readlane_b32 s18, v21, 47 +; 
GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; GFX9-NEXT: v_readlane_b32 s18, v21, 45 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; GFX9-NEXT: v_readlane_b32 s17, v21, 44 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 @@ -9958,75 +9957,75 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s16, s44, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 45 +; GFX9-NEXT: v_readlane_b32 s17, v21, 43 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s78, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 44 +; GFX9-NEXT: v_readlane_b32 s17, v21, 42 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s45, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 43 -; GFX9-NEXT: v_readlane_b32 s18, v21, 42 +; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: v_readlane_b32 s18, v21, 40 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: v_readlane_b32 s17, v21, 39 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s42, 
0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 40 +; GFX9-NEXT: v_readlane_b32 s17, v21, 38 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s76, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 39 +; GFX9-NEXT: v_readlane_b32 s17, v21, 37 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s43, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 38 -; GFX9-NEXT: v_readlane_b32 s18, v21, 37 +; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: v_readlane_b32 s18, v21, 35 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: v_readlane_b32 s17, v21, 34 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s40, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 35 +; GFX9-NEXT: v_readlane_b32 s17, v21, 33 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s74, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 34 +; GFX9-NEXT: v_readlane_b32 s17, v21, 32 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s41, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 
s17, v21, 33 -; GFX9-NEXT: v_readlane_b32 s18, v21, 32 +; GFX9-NEXT: v_readlane_b32 s17, v21, 31 +; GFX9-NEXT: v_readlane_b32 s18, v21, 30 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 @@ -10035,11 +10034,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_readlane_b32 s16, v21, 31 +; GFX9-NEXT: v_readlane_b32 s16, v21, 29 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s16 -; GFX9-NEXT: v_readlane_b32 s16, v21, 30 +; GFX9-NEXT: v_readlane_b32 s16, v21, 28 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s72, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 @@ -10049,11 +10048,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s14, s15, 0xff -; GFX9-NEXT: v_readlane_b32 s15, v21, 29 +; GFX9-NEXT: v_readlane_b32 s15, v21, 27 ; GFX9-NEXT: s_lshl_b32 s15, s15, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 -; GFX9-NEXT: v_readlane_b32 s15, v21, 28 -; GFX9-NEXT: v_readlane_b32 s16, v21, 27 +; GFX9-NEXT: v_readlane_b32 s15, v21, 26 +; GFX9-NEXT: v_readlane_b32 s16, v21, 25 ; GFX9-NEXT: s_and_b32 s15, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s15, s15, s16 @@ -10062,11 +10061,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_readlane_b32 s14, v21, 26 +; GFX9-NEXT: v_readlane_b32 s14, v21, 24 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s14 
-; GFX9-NEXT: v_readlane_b32 s14, v21, 25 +; GFX9-NEXT: v_readlane_b32 s14, v21, 23 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s15, s62, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 @@ -10076,11 +10075,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: s_and_b32 s12, s13, 0xff -; GFX9-NEXT: v_readlane_b32 s13, v21, 24 +; GFX9-NEXT: v_readlane_b32 s13, v21, 22 ; GFX9-NEXT: s_lshl_b32 s13, s13, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 -; GFX9-NEXT: v_readlane_b32 s13, v21, 23 -; GFX9-NEXT: v_readlane_b32 s14, v21, 22 +; GFX9-NEXT: v_readlane_b32 s13, v21, 21 +; GFX9-NEXT: v_readlane_b32 s14, v21, 20 ; GFX9-NEXT: s_and_b32 s13, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s13, s13, s14 @@ -10089,11 +10088,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 21 +; GFX9-NEXT: v_readlane_b32 s12, v21, 19 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 20 +; GFX9-NEXT: v_readlane_b32 s12, v21, 18 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s13, s60, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 @@ -10103,11 +10102,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: v_readlane_b32 s11, v21, 19 +; GFX9-NEXT: v_readlane_b32 s11, v21, 17 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 -; GFX9-NEXT: v_readlane_b32 s11, v21, 18 -; GFX9-NEXT: v_readlane_b32 s12, v21, 17 +; 
GFX9-NEXT: v_readlane_b32 s11, v21, 16 +; GFX9-NEXT: v_readlane_b32 s12, v21, 15 ; GFX9-NEXT: s_and_b32 s11, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s11, s11, s12 @@ -10116,11 +10115,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 16 +; GFX9-NEXT: v_readlane_b32 s10, v21, 14 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 15 +; GFX9-NEXT: v_readlane_b32 s10, v21, 13 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s58, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 @@ -10130,11 +10129,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: v_readlane_b32 s9, v21, 14 +; GFX9-NEXT: v_readlane_b32 s9, v21, 12 ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: v_readlane_b32 s9, v21, 13 -; GFX9-NEXT: v_readlane_b32 s10, v21, 12 +; GFX9-NEXT: v_readlane_b32 s9, v21, 11 +; GFX9-NEXT: v_readlane_b32 s10, v21, 10 ; GFX9-NEXT: s_and_b32 s9, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 @@ -10143,11 +10142,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 11 +; GFX9-NEXT: v_readlane_b32 s8, v21, 9 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 10 +; GFX9-NEXT: v_readlane_b32 s8, v21, 8 ; 
GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s56, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 @@ -10157,11 +10156,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: v_readlane_b32 s7, v21, 9 +; GFX9-NEXT: v_readlane_b32 s7, v21, 7 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_readlane_b32 s7, v21, 8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 7 +; GFX9-NEXT: v_readlane_b32 s7, v21, 6 +; GFX9-NEXT: v_readlane_b32 s8, v21, 5 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 @@ -10170,14 +10169,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_readlane_b32 s6, v21, 6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 4 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: v_readlane_b32 s6, v21, 5 -; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: v_readlane_b32 s6, v21, 3 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_lshl_b32 s7, s46, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ -10185,11 +10183,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: v_readlane_b32 s5, v21, 4 +; GFX9-NEXT: v_readlane_b32 s5, v21, 2 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_readlane_b32 s5, v21, 3 -; GFX9-NEXT: v_readlane_b32 s6, v21, 2 +; GFX9-NEXT: 
v_readlane_b32 s5, v21, 1 +; GFX9-NEXT: v_readlane_b32 s6, v21, 0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 @@ -10198,7 +10196,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_readlane_b32 s9, v21, 1 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: v_readlane_b32 s99, v20, 35 ; GFX9-NEXT: v_readlane_b32 s98, v20, 34 @@ -10243,16 +10240,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB13_4: -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: v_writelane_b32 v21, s82, 0 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 ; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr81 ; GFX9-NEXT: ; implicit-def: $sgpr80 ; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr70 @@ -10280,7 +10272,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr86 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 -; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -10296,100 +10289,103 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: ; 
implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr56 -; GFX9-NEXT: v_writelane_b32 v21, s83, 1 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; 
implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: 
killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: 
$sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB13_2 ; ; GFX11-LABEL: bitcast_v32i32_to_v128i8_scalar: @@ -10397,221 +10393,332 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b32 off, v16, s32 -; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v18, s32 +; GFX11-NEXT: scratch_store_b32 off, v19, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v20, s32 offset:8 +; GFX11-NEXT: 
scratch_store_b32 off, v21, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v16, s30, 0 -; GFX11-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-NEXT: v_writelane_b32 v18, s30, 0 +; GFX11-NEXT: v_writelane_b32 v19, s96, 0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 ; GFX11-NEXT: v_readfirstlane_b32 s40, v1 ; GFX11-NEXT: v_readfirstlane_b32 s41, v2 -; GFX11-NEXT: v_writelane_b32 v16, s31, 1 -; GFX11-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-NEXT: v_writelane_b32 v18, s31, 1 +; GFX11-NEXT: v_writelane_b32 v19, s97, 1 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 ; GFX11-NEXT: v_readfirstlane_b32 s12, v5 -; GFX11-NEXT: v_writelane_b32 v16, s34, 2 -; GFX11-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-NEXT: v_writelane_b32 v18, s34, 2 +; GFX11-NEXT: v_writelane_b32 v19, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s13, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v16, s35, 3 -; GFX11-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-NEXT: v_writelane_b32 v18, s35, 3 +; GFX11-NEXT: v_writelane_b32 v19, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v16, s36, 4 -; GFX11-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-NEXT: v_writelane_b32 v18, s36, 4 +; GFX11-NEXT: v_writelane_b32 v19, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s4, v13 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 -; GFX11-NEXT: v_writelane_b32 v16, s37, 5 -; GFX11-NEXT: v_writelane_b32 v17, s101, 5 -; GFX11-NEXT: s_mov_b32 s101, 0 +; GFX11-NEXT: v_writelane_b32 v18, s37, 5 +; GFX11-NEXT: v_writelane_b32 v19, s101, 5 +; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane -; 
GFX11-NEXT: v_writelane_b32 v16, s38, 6 -; GFX11-NEXT: v_writelane_b32 v17, s102, 6 -; GFX11-NEXT: v_writelane_b32 v16, s39, 7 -; GFX11-NEXT: v_writelane_b32 v17, s103, 7 -; GFX11-NEXT: v_writelane_b32 v16, s48, 8 -; GFX11-NEXT: v_writelane_b32 v17, s104, 8 -; GFX11-NEXT: v_writelane_b32 v16, s49, 9 -; GFX11-NEXT: v_writelane_b32 v16, s50, 10 -; GFX11-NEXT: v_writelane_b32 v16, s51, 11 -; GFX11-NEXT: v_writelane_b32 v16, s52, 12 -; GFX11-NEXT: v_writelane_b32 v16, s53, 13 -; GFX11-NEXT: v_writelane_b32 v16, s54, 14 -; GFX11-NEXT: v_writelane_b32 v16, s55, 15 -; GFX11-NEXT: v_writelane_b32 v16, s64, 16 -; GFX11-NEXT: v_writelane_b32 v16, s65, 17 -; GFX11-NEXT: v_writelane_b32 v16, s66, 18 -; GFX11-NEXT: v_writelane_b32 v16, s67, 19 -; GFX11-NEXT: v_writelane_b32 v16, s68, 20 -; GFX11-NEXT: v_writelane_b32 v16, s69, 21 -; GFX11-NEXT: v_writelane_b32 v16, s70, 22 -; GFX11-NEXT: v_writelane_b32 v16, s71, 23 -; GFX11-NEXT: v_writelane_b32 v16, s80, 24 -; GFX11-NEXT: v_writelane_b32 v16, s81, 25 -; GFX11-NEXT: v_writelane_b32 v16, s82, 26 -; GFX11-NEXT: v_writelane_b32 v16, s83, 27 -; GFX11-NEXT: v_writelane_b32 v16, s84, 28 -; GFX11-NEXT: v_writelane_b32 v16, s85, 29 -; GFX11-NEXT: v_writelane_b32 v16, s86, 30 -; GFX11-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr20 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v18, s38, 6 +; GFX11-NEXT: v_writelane_b32 v19, s102, 6 +; GFX11-NEXT: v_writelane_b32 v18, s39, 7 +; GFX11-NEXT: v_writelane_b32 v19, s103, 7 +; GFX11-NEXT: v_writelane_b32 v18, s48, 8 +; GFX11-NEXT: v_writelane_b32 v19, s104, 8 +; GFX11-NEXT: v_writelane_b32 v18, s49, 9 +; GFX11-NEXT: v_writelane_b32 v18, s50, 10 +; GFX11-NEXT: v_writelane_b32 v18, s51, 11 +; GFX11-NEXT: v_writelane_b32 v18, s52, 12 +; GFX11-NEXT: v_writelane_b32 v18, s53, 13 +; GFX11-NEXT: v_writelane_b32 v18, s54, 14 +; GFX11-NEXT: v_writelane_b32 v18, s55, 15 +; GFX11-NEXT: v_writelane_b32 
v18, s64, 16 +; GFX11-NEXT: v_writelane_b32 v18, s65, 17 +; GFX11-NEXT: v_writelane_b32 v18, s66, 18 +; GFX11-NEXT: v_writelane_b32 v18, s67, 19 +; GFX11-NEXT: v_writelane_b32 v18, s68, 20 +; GFX11-NEXT: v_writelane_b32 v18, s69, 21 +; GFX11-NEXT: v_writelane_b32 v18, s70, 22 +; GFX11-NEXT: v_writelane_b32 v18, s71, 23 +; GFX11-NEXT: v_writelane_b32 v18, s80, 24 +; GFX11-NEXT: v_writelane_b32 v18, s81, 25 +; GFX11-NEXT: v_writelane_b32 v18, s82, 26 +; GFX11-NEXT: v_writelane_b32 v18, s83, 27 +; GFX11-NEXT: v_writelane_b32 v18, s84, 28 +; GFX11-NEXT: v_writelane_b32 v18, s85, 29 +; GFX11-NEXT: v_writelane_b32 v18, s86, 30 +; GFX11-NEXT: v_writelane_b32 v18, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s43, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 16 -; GFX11-NEXT: s_lshr_b32 s43, s24, 16 -; GFX11-NEXT: s_lshr_b32 s104, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 17 -; GFX11-NEXT: s_lshr_b32 s43, s24, 8 -; GFX11-NEXT: s_lshr_b32 s57, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 18 -; GFX11-NEXT: s_lshr_b32 s43, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s69, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 19 -; GFX11-NEXT: s_lshr_b32 s43, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 20 -; GFX11-NEXT: s_lshr_b32 s43, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 21 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: 
s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 22 -; GFX11-NEXT: s_lshr_b32 s43, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 23 -; GFX11-NEXT: s_lshr_b32 s43, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 24 -; GFX11-NEXT: s_lshr_b32 s43, s21, 16 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 25 -; GFX11-NEXT: s_lshr_b32 s43, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 26 -; GFX11-NEXT: s_lshr_b32 s43, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 27 -; GFX11-NEXT: s_lshr_b32 s43, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 28 -; GFX11-NEXT: s_lshr_b32 s43, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 30 -; GFX11-NEXT: s_lshr_b32 s43, s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 31 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 0 -; GFX11-NEXT: s_lshr_b32 
s43, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 1 -; GFX11-NEXT: s_lshr_b32 s43, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 2 -; GFX11-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s99, s1, 16 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: v_writelane_b32 v18, s43, 3 -; GFX11-NEXT: s_lshr_b32 s43, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 4 -; GFX11-NEXT: s_lshr_b32 s43, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-NEXT: s_lshr_b32 s44, s7, 24 +; GFX11-NEXT: s_lshr_b32 s36, s5, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 2 +; GFX11-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-NEXT: s_lshr_b32 s34, s5, 16 +; GFX11-NEXT: s_lshr_b32 s35, s5, 8 +; GFX11-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 3 +; GFX11-NEXT: s_lshr_b32 s44, s7, 8 +; GFX11-NEXT: s_lshr_b32 s43, s4, 8 +; GFX11-NEXT: s_lshr_b32 s48, s14, 16 +; GFX11-NEXT: s_lshr_b32 s49, s14, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 4 +; GFX11-NEXT: s_lshr_b32 s44, s6, 16 +; GFX11-NEXT: s_lshr_b32 s55, s28, 16 +; GFX11-NEXT: s_lshr_b32 s64, s28, 8 +; GFX11-NEXT: s_lshr_b32 s65, s24, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 5 +; GFX11-NEXT: s_lshr_b32 s44, s6, 8 +; GFX11-NEXT: s_lshr_b32 s66, s24, 8 +; GFX11-NEXT: s_lshr_b32 s37, s23, 24 +; GFX11-NEXT: s_lshr_b32 s38, s23, 16 
+; GFX11-NEXT: v_writelane_b32 v21, s44, 6 +; GFX11-NEXT: s_lshr_b32 s44, s9, 24 +; GFX11-NEXT: s_lshr_b32 s39, s23, 8 +; GFX11-NEXT: s_lshr_b32 s50, s22, 16 +; GFX11-NEXT: s_lshr_b32 s51, s22, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 7 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s52, s21, 24 +; GFX11-NEXT: s_lshr_b32 s53, s21, 16 +; GFX11-NEXT: s_lshr_b32 s54, s21, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 8 +; GFX11-NEXT: s_lshr_b32 s44, s9, 8 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 9 +; GFX11-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: s_lshr_b32 s80, s18, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 10 +; GFX11-NEXT: s_lshr_b32 s44, s8, 8 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: s_lshr_b32 s82, s17, 24 +; GFX11-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 11 +; GFX11-NEXT: s_lshr_b32 s44, s11, 24 +; GFX11-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: s_lshr_b32 s86, s16, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 12 +; GFX11-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-NEXT: s_lshr_b32 s87, s3, 24 +; GFX11-NEXT: s_lshr_b32 s96, s3, 16 +; GFX11-NEXT: s_lshr_b32 s97, s3, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 13 +; GFX11-NEXT: s_lshr_b32 s44, s11, 8 +; GFX11-NEXT: s_lshr_b32 s98, s2, 16 +; GFX11-NEXT: s_lshr_b32 s99, s2, 8 +; GFX11-NEXT: s_lshr_b32 s100, s1, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 14 +; GFX11-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-NEXT: s_lshr_b32 s101, s1, 16 +; GFX11-NEXT: s_lshr_b32 s102, s1, 8 +; GFX11-NEXT: s_lshr_b32 s103, s0, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 15 +; GFX11-NEXT: s_lshr_b32 s44, s10, 8 +; GFX11-NEXT: s_lshr_b32 s104, s0, 8 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; 
GFX11-NEXT: v_writelane_b32 v21, s44, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 17 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[26:27], 24 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 6 -; GFX11-NEXT: s_lshr_b32 s43, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 18 +; GFX11-NEXT: s_lshr_b32 s44, s13, 8 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 7 -; GFX11-NEXT: s_lshr_b32 s43, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 19 +; GFX11-NEXT: s_lshr_b32 s44, s12, 16 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 8 -; GFX11-NEXT: s_lshr_b32 s43, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 9 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 20 +; GFX11-NEXT: s_lshr_b32 s44, s12, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 21 +; GFX11-NEXT: s_lshr_b32 s44, s15, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 22 +; GFX11-NEXT: s_lshr_b32 s44, s15, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 23 +; GFX11-NEXT: s_lshr_b32 s44, s15, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 24 +; GFX11-NEXT: s_lshr_b32 s44, s41, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 25 +; GFX11-NEXT: s_lshr_b32 s44, s41, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 26 +; GFX11-NEXT: s_lshr_b32 s44, s41, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 27 +; GFX11-NEXT: s_lshr_b32 s44, s40, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 28 +; GFX11-NEXT: s_lshr_b32 s44, s40, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 29 +; GFX11-NEXT: s_lshr_b32 s44, s29, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 30 +; GFX11-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 31 +; GFX11-NEXT: s_lshr_b32 s44, s29, 8 +; GFX11-NEXT: v_writelane_b32 v20, s44, 0 +; GFX11-NEXT: s_lshr_b32 s44, s27, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s44, 1 +; GFX11-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-NEXT: v_writelane_b32 v20, s44, 2 +; GFX11-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s44, 3 +; GFX11-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-NEXT: v_writelane_b32 v20, s44, 4 +; GFX11-NEXT: s_lshr_b32 s44, s26, 
8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s44, 5 +; GFX11-NEXT: s_lshr_b32 s44, s25, 24 +; GFX11-NEXT: v_writelane_b32 v20, s44, 6 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s44, 7 +; GFX11-NEXT: s_lshr_b32 s44, s25, 8 +; GFX11-NEXT: v_writelane_b32 v20, s44, 8 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 0 +; GFX11-NEXT: v_writelane_b32 v21, s45, 1 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 ; GFX11-NEXT: s_branch .LBB13_3 ; GFX11-NEXT: .LBB13_2: -; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: s_mov_b32 vcc_hi, -1 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr30 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr99 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr97 +; GFX11-NEXT: ; implicit-def: $sgpr96 +; GFX11-NEXT: ; implicit-def: $sgpr87 +; GFX11-NEXT: ; implicit-def: $sgpr86 +; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr92 +; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr83 +; GFX11-NEXT: ; implicit-def: $sgpr82 +; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr90 +; GFX11-NEXT: 
; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr70 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr78 +; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr52 +; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr64 +; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr43 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr88 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr76 +; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr74 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr72 +; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: v_writelane_b32 v21, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: v_writelane_b32 v21, vcc_hi, 1 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; 
kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 0 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -10621,8 +10728,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 1 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -10633,7 +10738,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 
-; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 2 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -10644,696 +10748,601 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 3 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 4 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 5 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr45 -; GFX11-NEXT: ; implicit-def: $sgpr44 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr97 -; 
GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr73 -; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr86 -; GFX11-NEXT: ; implicit-def: $sgpr85 -; GFX11-NEXT: ; implicit-def: $sgpr84 -; GFX11-NEXT: ; implicit-def: $sgpr83 -; GFX11-NEXT: ; implicit-def: $sgpr82 -; GFX11-NEXT: ; implicit-def: $sgpr81 -; GFX11-NEXT: ; implicit-def: $sgpr61 -; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr71 -; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr58 -; GFX11-NEXT: ; implicit-def: $sgpr59 -; GFX11-NEXT: ; implicit-def: $sgpr68 -; GFX11-NEXT: ; implicit-def: $sgpr67 -; GFX11-NEXT: ; implicit-def: $sgpr66 -; GFX11-NEXT: ; implicit-def: $sgpr65 -; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: $sgpr55 -; GFX11-NEXT: ; implicit-def: $sgpr54 -; GFX11-NEXT: ; implicit-def: $sgpr53 -; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr51 -; GFX11-NEXT: ; implicit-def: $sgpr50 -; GFX11-NEXT: ; implicit-def: $sgpr49 -; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr37 -; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr56 -; GFX11-NEXT: ; implicit-def: $sgpr69 -; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr47 -; GFX11-NEXT: ; implicit-def: $sgpr57 -; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 7 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 8 -; GFX11-NEXT: 
v_writelane_b32 v19, vcc_hi, 9 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 11 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 13 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 14 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 15 ; GFX11-NEXT: .LBB13_3: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 -; GFX11-NEXT: s_mov_b32 s101, s104 -; GFX11-NEXT: s_mov_b32 s104, s57 -; GFX11-NEXT: s_mov_b32 s57, s69 -; GFX11-NEXT: s_mov_b32 s69, s42 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-NEXT: s_mov_b32 vcc_hi, s36 +; GFX11-NEXT: s_mov_b32 s36, s42 +; GFX11-NEXT: s_mov_b32 s42, s43 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_5 ; GFX11-NEXT: ; %bb.4: ; %cmp.true -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: s_add_i32 s23, s23, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s24, 16 -; GFX11-NEXT: s_add_i32 s22, s22, 3 -; GFX11-NEXT: s_add_i32 s21, s21, 3 -; GFX11-NEXT: s_add_i32 s20, s20, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s24, 8 -; GFX11-NEXT: s_add_i32 s19, s19, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 ; GFX11-NEXT: s_add_i32 s7, s7, 3 ; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s7, 24 ; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 2 +; GFX11-NEXT: s_lshr_b32 s43, s7, 16 ; GFX11-NEXT: s_add_i32 s8, s8, 3 ; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, 
s23, 8 ; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 3 +; GFX11-NEXT: s_lshr_b32 s43, s7, 8 ; GFX11-NEXT: s_add_i32 s13, s13, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s22, 16 ; GFX11-NEXT: s_add_i32 s12, s12, 3 -; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s15, s15, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s22, 8 -; GFX11-NEXT: s_add_i32 s14, s14, 3 -; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 4 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 ; GFX11-NEXT: s_add_i32 s41, s41, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 ; GFX11-NEXT: s_add_i32 s40, s40, 3 -; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 5 +; GFX11-NEXT: s_lshr_b32 s43, s6, 8 +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 6 +; GFX11-NEXT: s_lshr_b32 s43, s9, 24 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 7 +; GFX11-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 8 +; GFX11-NEXT: s_lshr_b32 s43, s9, 8 ; GFX11-NEXT: s_add_i32 s2, s2, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s20, 16 -; GFX11-NEXT: s_lshr_b32 s101, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: 
s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b32 s104, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s18, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v18, 
s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: s_lshr_b32 s69, s25, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: s_lshr_b32 s99, s1, 
16 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 9 +; GFX11-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 10 +; GFX11-NEXT: s_lshr_b32 s43, s8, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 11 +; GFX11-NEXT: s_lshr_b32 s43, s11, 24 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: v_writelane_b32 v21, s43, 12 +; GFX11-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s5, 24 +; GFX11-NEXT: s_lshr_b32 s34, s5, 16 +; GFX11-NEXT: s_lshr_b32 s35, s5, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 13 +; GFX11-NEXT: s_lshr_b32 s43, s11, 8 +; GFX11-NEXT: s_lshr_b32 s36, s4, 16 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s48, s14, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 14 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s49, s14, 8 +; GFX11-NEXT: s_lshr_b32 s55, s28, 16 +; GFX11-NEXT: s_lshr_b32 s64, s28, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 15 +; GFX11-NEXT: s_lshr_b32 s43, s10, 8 +; GFX11-NEXT: s_lshr_b32 s65, s24, 16 +; GFX11-NEXT: s_lshr_b32 s66, s24, 8 +; GFX11-NEXT: s_lshr_b32 s37, s23, 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 16 +; GFX11-NEXT: s_lshr_b32 s43, s13, 24 +; GFX11-NEXT: s_lshr_b32 s38, s23, 16 +; GFX11-NEXT: s_lshr_b32 s39, s23, 8 +; GFX11-NEXT: s_lshr_b32 s50, s22, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 17 +; GFX11-NEXT: s_lshr_b32 s43, s13, 16 +; GFX11-NEXT: s_lshr_b32 s51, s22, 8 +; GFX11-NEXT: s_lshr_b32 
s52, s21, 24 +; GFX11-NEXT: s_lshr_b32 s53, s21, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 18 +; GFX11-NEXT: s_lshr_b32 s43, s13, 8 +; GFX11-NEXT: s_lshr_b32 s54, s21, 8 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 19 +; GFX11-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 20 +; GFX11-NEXT: s_lshr_b32 s43, s12, 8 +; GFX11-NEXT: s_lshr_b32 s80, s18, 16 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: s_lshr_b32 s82, s17, 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 21 +; GFX11-NEXT: s_lshr_b32 s43, s15, 24 +; GFX11-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 22 +; GFX11-NEXT: s_lshr_b32 s43, s15, 16 +; GFX11-NEXT: s_lshr_b32 s86, s16, 8 +; GFX11-NEXT: s_lshr_b32 s87, s3, 24 +; GFX11-NEXT: s_lshr_b32 s96, s3, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 23 +; GFX11-NEXT: s_lshr_b32 s43, s15, 8 +; GFX11-NEXT: s_lshr_b32 s97, s3, 8 +; GFX11-NEXT: s_lshr_b32 s98, s2, 16 +; GFX11-NEXT: s_lshr_b32 s99, s2, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 24 +; GFX11-NEXT: s_lshr_b32 s43, s41, 24 +; GFX11-NEXT: s_lshr_b32 s100, s1, 24 +; GFX11-NEXT: s_lshr_b32 s101, s1, 16 +; GFX11-NEXT: s_lshr_b32 s102, s1, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 25 +; GFX11-NEXT: s_lshr_b32 s43, s41, 16 +; GFX11-NEXT: s_lshr_b32 s103, s0, 16 +; GFX11-NEXT: s_lshr_b32 s104, s0, 8 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 26 +; GFX11-NEXT: s_lshr_b32 s43, s41, 8 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 27 +; GFX11-NEXT: s_lshr_b32 s43, s40, 16 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[40:41], 
24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[26:27], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 28 +; GFX11-NEXT: s_lshr_b32 s43, s40, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 29 +; GFX11-NEXT: s_lshr_b32 s43, s29, 24 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 30 +; GFX11-NEXT: s_lshr_b32 s43, s29, 16 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-NEXT: v_writelane_b32 v21, s43, 31 +; GFX11-NEXT: s_lshr_b32 s43, s29, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s43, 0 +; GFX11-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 0 +; GFX11-NEXT: v_writelane_b32 v20, s43, 1 +; GFX11-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-NEXT: v_writelane_b32 v21, s45, 1 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 +; GFX11-NEXT: v_writelane_b32 v20, s43, 2 +; GFX11-NEXT: s_lshr_b32 s43, s27, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s43, 3 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: v_writelane_b32 v20, s43, 4 +; GFX11-NEXT: s_lshr_b32 s43, s26, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s43, 5 +; GFX11-NEXT: s_lshr_b32 s43, s25, 24 +; GFX11-NEXT: v_writelane_b32 v20, s43, 6 +; GFX11-NEXT: s_lshr_b32 s43, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s43, 7 +; GFX11-NEXT: s_lshr_b32 s43, s25, 8 +; GFX11-NEXT: v_writelane_b32 v20, s43, 8 ; GFX11-NEXT: .LBB13_5: ; %end -; 
GFX11-NEXT: s_lshl_b32 s43, s43, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s42, s74, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s43 -; GFX11-NEXT: s_lshl_b32 s43, s94, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_lshl_b32 s45, s45, 8 -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-NEXT: s_lshl_b32 s43, s104, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 9 -; GFX11-NEXT: s_or_b32 s0, s0, s45 -; GFX11-NEXT: s_lshl_b32 s45, s30, 8 -; GFX11-NEXT: s_and_b32 s44, s44, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s42, s42, 8 +; GFX11-NEXT: s_and_b32 s45, s103, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s43 +; GFX11-NEXT: s_lshl_b32 s43, s30, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 8 -; GFX11-NEXT: v_readlane_b32 s43, v18, 7 -; GFX11-NEXT: s_or_b32 s0, s0, s44 +; GFX11-NEXT: s_or_b32 s43, s45, s43 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s44, s100, 8 -; GFX11-NEXT: s_lshl_b32 s45, s98, 8 -; GFX11-NEXT: s_or_b32 s1, s1, s44 -; GFX11-NEXT: s_and_b32 s44, s99, 0xff -; GFX11-NEXT: s_and_b32 s42, s42, 0xff -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 +; GFX11-NEXT: s_lshl_b32 s45, s100, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s43 +; GFX11-NEXT: s_lshl_b32 s43, s102, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s43 +; GFX11-NEXT: s_and_b32 s43, s101, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_or_b32 s1, s1, s44 -; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: 
v_readlane_b32 s0, v18, 6 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: v_readlane_b32 s2, v18, 5 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: s_or_b32 s43, s43, s45 +; GFX11-NEXT: s_and_b32 s45, s98, 0xff +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s43 +; GFX11-NEXT: s_lshl_b32 s43, s99, 8 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_or_b32 s2, s2, s43 +; GFX11-NEXT: s_lshl_b32 s43, s94, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s43, s45, s43 +; GFX11-NEXT: s_lshl_b32 s45, s87, 8 +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 +; GFX11-NEXT: s_lshl_b32 s0, s86, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s43 +; GFX11-NEXT: s_lshl_b32 s43, s97, 8 ; GFX11-NEXT: s_and_b32 s1, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s3, v18, 2 +; GFX11-NEXT: s_or_b32 s3, s3, s43 +; GFX11-NEXT: s_and_b32 s43, s96, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_or_b32 s43, s43, s45 ; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 ; GFX11-NEXT: s_lshl_b32 s1, s92, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s3, s3, s43 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: s_and_b32 s2, s85, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v18, 4 +; GFX11-NEXT: s_lshl_b32 s2, s84, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s3, s82, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s17, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s16, v18, 0 +; GFX11-NEXT: s_and_b32 s16, s80, 0xff ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 3 +; GFX11-NEXT: s_and_b32 
s2, s83, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s17, v19, 29 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s100, v17, 4 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s99, v17, 3 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s18, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: s_lshl_b32 s17, s69, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 1 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_readlane_b32 s0, v19, 28 -; GFX11-NEXT: s_and_b32 s1, s20, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 19 +; GFX11-NEXT: s_lshl_b32 s2, s81, 8 +; GFX11-NEXT: s_and_b32 s18, s20, 0xff ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_lshl_b32 s3, s90, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: v_readlane_b32 s16, v19, 31 +; GFX11-NEXT: s_lshl_b32 s16, s71, 8 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s19, 0xff -; GFX11-NEXT: s_lshl_b32 s16, s16, 8 -; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s19, s67, 0xff ; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: v_readlane_b32 s16, v19, 30 +; GFX11-NEXT: s_and_b32 s16, s70, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s88, 8 +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_lshl_b32 s17, s68, 8 +; GFX11-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-NEXT: s_or_b32 s17, s18, s17 +; GFX11-NEXT: s_lshl_b32 s18, s78, 8 +; GFX11-NEXT: s_and_b32 s17, s17, 0xffff +; GFX11-NEXT: s_or_b32 s18, s19, s18 +; GFX11-NEXT: s_and_b32 s19, s28, 0xff +; GFX11-NEXT: s_lshl_b32 s18, s18, 16 +; GFX11-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-NEXT: s_or_b32 s17, s17, s18 +; GFX11-NEXT: s_and_b32 s18, s24, 0xff +; GFX11-NEXT: v_mov_b32_e32 v10, s17 +; 
GFX11-NEXT: s_lshl_b32 s17, s66, 8 +; GFX11-NEXT: s_and_b32 s1, s14, 0xff +; GFX11-NEXT: s_or_b32 s17, s18, s17 +; GFX11-NEXT: s_and_b32 s18, s65, 0xff +; GFX11-NEXT: s_and_b32 s17, s17, 0xffff +; GFX11-NEXT: s_or_b32 s0, s18, s0 +; GFX11-NEXT: s_lshl_b32 s18, s64, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s18, s19, s18 +; GFX11-NEXT: s_or_b32 s0, s17, s0 +; GFX11-NEXT: s_lshl_b32 s17, s76, 8 +; GFX11-NEXT: v_mov_b32_e32 v14, s0 +; GFX11-NEXT: s_and_b32 s0, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s16, 16 +; GFX11-NEXT: s_and_b32 s19, s55, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_lshl_b32 s2, s49, 8 +; GFX11-NEXT: v_mov_b32_e32 v9, s0 +; GFX11-NEXT: s_lshl_b32 s0, s74, 8 +; GFX11-NEXT: s_and_b32 s3, s48, 0xff +; GFX11-NEXT: s_or_b32 s17, s19, s17 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s0, s3, s0 +; GFX11-NEXT: s_and_b32 s18, s18, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s17, s18, s17 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s17 +; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off +; GFX11-NEXT: v_mov_b32_e32 v5, s0 +; GFX11-NEXT: s_and_b32 s0, s21, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s54, 8 +; GFX11-NEXT: s_and_b32 s2, s53, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s52, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s51, 8 +; GFX11-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-NEXT: s_and_b32 s14, s50, 0xff +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_or_b32 s2, s14, s3 +; GFX11-NEXT: s_and_b32 s3, s23, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s39, 8 +; GFX11-NEXT: s_and_b32 s16, s38, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s37, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s14 +; GFX11-NEXT: s_or_b32 s14, 
s16, s17 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s78, 8 +; GFX11-NEXT: s_lshl_b32 s14, s14, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s14 +; GFX11-NEXT: v_dual_mov_b32 v11, s0 :: v_dual_mov_b32 v12, s1 +; GFX11-NEXT: v_mov_b32_e32 v13, s2 +; GFX11-NEXT: v_readlane_b32 s1, v21, 16 +; GFX11-NEXT: v_readlane_b32 s2, v21, 15 +; GFX11-NEXT: s_and_b32 s0, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s72, 8 +; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s18, s18, 8 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s86, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 21 -; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_readlane_b32 s98, v17, 2 -; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 27 -; GFX11-NEXT: v_readlane_b32 s3, v19, 24 -; GFX11-NEXT: v_readlane_b32 s16, v19, 22 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_readlane_b32 s2, v20, 7 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_readlane_b32 s1, v20, 8 +; GFX11-NEXT: v_readlane_b32 s3, v20, 6 +; GFX11-NEXT: v_mov_b32_e32 v9, s0 +; GFX11-NEXT: s_and_b32 s0, s25, 0xff ; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v19, 26 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; 
GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: v_readlane_b32 s2, v20, 5 +; GFX11-NEXT: v_readlane_b32 s3, v20, 4 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s60, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s21, 0xff +; GFX11-NEXT: s_and_b32 s1, s26, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 25 +; GFX11-NEXT: s_or_b32 s2, s3, s10 +; GFX11-NEXT: v_readlane_b32 s10, v20, 3 +; GFX11-NEXT: v_readlane_b32 s14, v20, 2 +; GFX11-NEXT: v_readlane_b32 s16, v20, 1 +; GFX11-NEXT: s_and_b32 s3, s27, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s31, v16, 1 -; GFX11-NEXT: v_readlane_b32 s30, v16, 0 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s10 +; GFX11-NEXT: s_or_b32 s10, s14, s16 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 23 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_readlane_b32 s1, v19, 18 -; GFX11-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s10 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s62, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: s_and_b32 s16, s23, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s16, 
s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 20 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-NEXT: v_readlane_b32 s1, v21, 6 +; GFX11-NEXT: v_readlane_b32 s2, v21, 5 +; GFX11-NEXT: v_mov_b32_e32 v15, s0 +; GFX11-NEXT: s_and_b32 s0, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s56, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s17, s17, s18 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: s_lshl_b32 s17, s97, 8 -; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 17 -; GFX11-NEXT: s_lshl_b32 s3, s88, 8 -; GFX11-NEXT: s_and_b32 s16, s69, 0xff -; GFX11-NEXT: s_and_b32 s18, s72, 0xff -; GFX11-NEXT: v_readlane_b32 s97, v17, 1 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: v_readlane_b32 s3, v19, 16 -; GFX11-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readlane_b32 s2, v21, 31 ; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_readlane_b32 s1, v20, 0 +; GFX11-NEXT: v_readlane_b32 s3, v21, 30 +; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX11-NEXT: v_mov_b32_e32 v13, s0 +; GFX11-NEXT: s_and_b32 s0, s29, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s73, 0xff -; 
GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s26, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s96, 8 -; GFX11-NEXT: s_lshl_b32 s17, s76, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s16, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s87, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xffff -; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 0 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s0, s28, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s85, 8 -; GFX11-NEXT: s_and_b32 s2, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s17, v19, 1 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s83, 8 -; GFX11-NEXT: s_and_b32 s16, s82, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s81, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 2 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: v_readlane_b32 s2, v21, 29 +; GFX11-NEXT: v_readlane_b32 s3, v21, 28 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s6, s58, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s40, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s61, 8 -; GFX11-NEXT: s_and_b32 s16, s80, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v19, 3 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s16, s41, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s60, 8 -; 
GFX11-NEXT: s_and_b32 s18, s71, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s70, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xffff -; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 4 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: s_and_b32 s0, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s58, 8 -; GFX11-NEXT: s_and_b32 s2, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-NEXT: s_and_b32 s1, s40, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_readlane_b32 s6, v21, 27 +; GFX11-NEXT: v_readlane_b32 s10, v21, 26 +; GFX11-NEXT: v_readlane_b32 s14, v21, 25 +; GFX11-NEXT: s_and_b32 s3, s41, 0xff +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_or_b32 s3, s3, s6 +; GFX11-NEXT: s_or_b32 s6, s10, s14 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-NEXT: v_readlane_b32 s1, v21, 24 +; GFX11-NEXT: v_readlane_b32 s3, v21, 23 +; GFX11-NEXT: v_readlane_b32 s6, v21, 22 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_and_b32 s0, s15, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s15, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s68, 8 -; GFX11-NEXT: s_and_b32 s14, s67, 0xff -; 
GFX11-NEXT: s_lshl_b32 s15, s66, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s14, s15 -; GFX11-NEXT: v_readlane_b32 s14, v19, 6 +; GFX11-NEXT: s_or_b32 s1, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v4, s2 +; GFX11-NEXT: v_readlane_b32 s2, v21, 21 +; GFX11-NEXT: v_readlane_b32 s3, v21, 20 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s6, s46, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s65, 8 -; GFX11-NEXT: s_and_b32 s12, s64, 0xff -; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v19, 7 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s12, s14 -; GFX11-NEXT: s_and_b32 s12, s13, 0xff -; GFX11-NEXT: s_lshl_b32 s13, s55, 8 -; GFX11-NEXT: s_and_b32 s14, s54, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s53, 8 -; GFX11-NEXT: s_or_b32 s12, s12, s13 -; GFX11-NEXT: s_or_b32 s13, s14, s15 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s12, s12, 0xffff -; GFX11-NEXT: s_lshl_b32 s13, s13, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: v_readlane_b32 s12, v19, 8 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: s_and_b32 s0, s10, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s52, 8 -; GFX11-NEXT: s_and_b32 s2, s51, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-NEXT: s_and_b32 s1, s12, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_readlane_b32 s6, v21, 19 +; 
GFX11-NEXT: v_readlane_b32 s10, v21, 18 +; GFX11-NEXT: v_readlane_b32 s12, v21, 17 +; GFX11-NEXT: s_and_b32 s3, s13, 0xff +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_or_b32 s3, s3, s6 +; GFX11-NEXT: s_or_b32 s6, s10, s12 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-NEXT: v_readlane_b32 s1, v21, 14 +; GFX11-NEXT: v_readlane_b32 s3, v21, 13 +; GFX11-NEXT: v_readlane_b32 s6, v21, 12 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_and_b32 s0, s11, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s11, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s50, 8 -; GFX11-NEXT: s_and_b32 s10, s49, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s48, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v19, 10 +; GFX11-NEXT: s_or_b32 s1, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-NEXT: v_readlane_b32 s2, v21, 11 +; GFX11-NEXT: v_readlane_b32 s3, v21, 10 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s6, s44, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s39, 8 -; GFX11-NEXT: s_and_b32 s8, s38, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v19, 11 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s8, s10 -; GFX11-NEXT: s_and_b32 s8, s9, 0xff -; GFX11-NEXT: s_lshl_b32 s9, s37, 8 -; GFX11-NEXT: s_and_b32 
s10, s36, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s35, 8 -; GFX11-NEXT: s_or_b32 s8, s8, s9 -; GFX11-NEXT: s_or_b32 s9, s10, s11 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s8, s8, 0xffff -; GFX11-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v19, 12 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s0, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s56, 8 -; GFX11-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-NEXT: s_and_b32 s1, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: s_and_b32 s3, s9, 0xff +; GFX11-NEXT: v_readlane_b32 s6, v21, 9 +; GFX11-NEXT: v_readlane_b32 s8, v21, 8 +; GFX11-NEXT: v_readlane_b32 s9, v21, 7 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v11, s1 +; GFX11-NEXT: v_readlane_b32 s1, v21, 4 +; GFX11-NEXT: v_readlane_b32 s3, v21, 3 +; GFX11-NEXT: v_readlane_b32 s6, v21, 2 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_and_b32 s0, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s34, 8 -; GFX11-NEXT: s_and_b32 s6, vcc_hi, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s46, 8 -; GFX11-NEXT: 
s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v19, 14 +; GFX11-NEXT: s_or_b32 s1, s3, s6 +; GFX11-NEXT: v_readlane_b32 s6, v21, 0 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: v_mov_b32_e32 v12, s2 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s47, 8 -; GFX11-NEXT: s_and_b32 s4, s104, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v19, 15 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s4, s6 -; GFX11-NEXT: s_and_b32 s4, s5, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s103, 8 -; GFX11-NEXT: s_and_b32 s6, s102, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s101, 8 -; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s6, s7 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s4, s5 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s17, v19, 5 -; GFX11-NEXT: v_readlane_b32 s13, v19, 9 -; GFX11-NEXT: v_readlane_b32 s9, v19, 13 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 -; GFX11-NEXT: v_readlane_b32 s104, v17, 8 -; GFX11-NEXT: v_readlane_b32 s103, v17, 7 -; GFX11-NEXT: v_readlane_b32 s102, v17, 6 -; GFX11-NEXT: v_readlane_b32 s101, v17, 5 -; GFX11-NEXT: v_readlane_b32 s96, v17, 0 -; GFX11-NEXT: v_readlane_b32 s87, v16, 31 -; GFX11-NEXT: v_readlane_b32 s85, v16, 29 -; GFX11-NEXT: v_readlane_b32 s84, 
v16, 28 -; GFX11-NEXT: v_readlane_b32 s83, v16, 27 -; GFX11-NEXT: v_readlane_b32 s82, v16, 26 -; GFX11-NEXT: v_readlane_b32 s81, v16, 25 -; GFX11-NEXT: v_readlane_b32 s80, v16, 24 -; GFX11-NEXT: v_readlane_b32 s71, v16, 23 -; GFX11-NEXT: v_readlane_b32 s70, v16, 22 -; GFX11-NEXT: v_readlane_b32 s68, v16, 20 -; GFX11-NEXT: v_readlane_b32 s67, v16, 19 -; GFX11-NEXT: v_readlane_b32 s66, v16, 18 -; GFX11-NEXT: v_readlane_b32 s65, v16, 17 -; GFX11-NEXT: v_readlane_b32 s64, v16, 16 -; GFX11-NEXT: v_readlane_b32 s55, v16, 15 -; GFX11-NEXT: v_readlane_b32 s54, v16, 14 -; GFX11-NEXT: v_readlane_b32 s53, v16, 13 -; GFX11-NEXT: v_readlane_b32 s52, v16, 12 -; GFX11-NEXT: v_readlane_b32 s51, v16, 11 -; GFX11-NEXT: v_readlane_b32 s50, v16, 10 -; GFX11-NEXT: v_readlane_b32 s49, v16, 9 -; GFX11-NEXT: v_readlane_b32 s48, v16, 8 -; GFX11-NEXT: v_readlane_b32 s39, v16, 7 -; GFX11-NEXT: v_readlane_b32 s38, v16, 6 -; GFX11-NEXT: v_readlane_b32 s37, v16, 5 -; GFX11-NEXT: v_readlane_b32 s36, v16, 4 -; GFX11-NEXT: v_readlane_b32 s35, v16, 3 -; GFX11-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-NEXT: s_and_b32 s1, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s42, 8 +; GFX11-NEXT: s_and_b32 s3, s36, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s6, 8 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: s_and_b32 s3, s5, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s35, 8 +; GFX11-NEXT: s_and_b32 s5, s34, 0xff +; GFX11-NEXT: s_lshl_b32 s6, vcc_hi, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s4 +; GFX11-NEXT: s_or_b32 s4, s5, s6 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s1 +; GFX11-NEXT: v_mov_b32_e32 v16, s2 +; GFX11-NEXT: v_readlane_b32 s7, v21, 1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: 
scratch_store_b128 v0, v[1:4], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:112 +; GFX11-NEXT: v_readlane_b32 s104, v19, 8 +; GFX11-NEXT: v_readlane_b32 s103, v19, 7 +; GFX11-NEXT: v_readlane_b32 s102, v19, 6 +; GFX11-NEXT: v_readlane_b32 s101, v19, 5 +; GFX11-NEXT: v_readlane_b32 s100, v19, 4 +; GFX11-NEXT: v_readlane_b32 s99, v19, 3 +; GFX11-NEXT: v_readlane_b32 s98, v19, 2 +; GFX11-NEXT: v_readlane_b32 s97, v19, 1 +; GFX11-NEXT: v_readlane_b32 s96, v19, 0 +; GFX11-NEXT: v_readlane_b32 s87, v18, 31 +; GFX11-NEXT: v_readlane_b32 s86, v18, 30 +; GFX11-NEXT: v_readlane_b32 s85, v18, 29 +; GFX11-NEXT: v_readlane_b32 s84, v18, 28 +; GFX11-NEXT: v_readlane_b32 s83, v18, 27 +; GFX11-NEXT: v_readlane_b32 s82, v18, 26 +; GFX11-NEXT: v_readlane_b32 s81, v18, 25 +; GFX11-NEXT: v_readlane_b32 s80, v18, 24 +; GFX11-NEXT: v_readlane_b32 s71, v18, 23 +; GFX11-NEXT: v_readlane_b32 s70, v18, 22 +; GFX11-NEXT: v_readlane_b32 s69, v18, 21 +; GFX11-NEXT: v_readlane_b32 s68, v18, 20 +; GFX11-NEXT: v_readlane_b32 s67, v18, 19 +; GFX11-NEXT: v_readlane_b32 s66, v18, 18 +; GFX11-NEXT: v_readlane_b32 s65, v18, 17 +; GFX11-NEXT: v_readlane_b32 s64, v18, 16 +; GFX11-NEXT: v_readlane_b32 s55, v18, 15 +; GFX11-NEXT: v_readlane_b32 s54, v18, 14 +; GFX11-NEXT: v_readlane_b32 s53, v18, 13 +; GFX11-NEXT: v_readlane_b32 s52, v18, 12 +; GFX11-NEXT: v_readlane_b32 s51, v18, 11 +; GFX11-NEXT: v_readlane_b32 s50, v18, 10 +; GFX11-NEXT: v_readlane_b32 s49, v18, 9 +; GFX11-NEXT: v_readlane_b32 s48, v18, 8 +; GFX11-NEXT: v_readlane_b32 s39, v18, 7 +; GFX11-NEXT: v_readlane_b32 s38, v18, 6 +; GFX11-NEXT: v_readlane_b32 s37, v18, 5 +; GFX11-NEXT: v_readlane_b32 s36, v18, 4 +; GFX11-NEXT: v_readlane_b32 s35, v18, 3 +; GFX11-NEXT: v_readlane_b32 s34, v18, 2 +; GFX11-NEXT: v_readlane_b32 s31, v18, 1 +; GFX11-NEXT: v_readlane_b32 s30, v18, 0 ; GFX11-NEXT: 
s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v16, off, s32 -; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v18, off, s32 +; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -20593,10 +20602,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:240 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v95, off, s32 offset:248 @@ -20629,7 +20638,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:140 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:132 ; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:124 @@ -20690,25 +20699,25 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v115 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 8, v116 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v132 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) @@ -20749,253 +20758,262 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v135 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v145 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v129 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v131 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff @@ -21023,18 +21041,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; 
GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 @@ -21213,7 +21221,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v119 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v58, v1 @@ -21287,9 +21295,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v145, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v131, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v132, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -21323,13 +21331,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v133, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v135, 
v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v144, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v128, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v129, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v130, v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -21534,10 +21542,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:232 ; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 @@ -21570,7 +21578,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 ; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:140 ; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 ; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 @@ -21631,25 +21639,25 @@ define inreg <32 x i32> 
@bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v99 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v132 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) @@ -21690,253 +21698,262 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xff, v46 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v119 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v134 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v135 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v144 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v129 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff @@ -21964,18 +21981,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 ; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 @@ -22154,7 +22161,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 
3, v162 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v119 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 @@ -22228,9 +22235,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v145, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v131, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v132, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -22264,13 +22271,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v133, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v135, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v144, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v128, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v129, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v130, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -23944,8 +23951,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -24023,8 +24030,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -24893,9 +24900,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -24911,9 +24918,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -24929,9 +24936,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -24947,9 +24954,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -24965,9 +24972,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -24983,9 +24990,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; 
VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25001,9 +25008,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25019,9 +25026,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25037,9 +25044,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25055,9 +25062,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25073,9 +25080,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25091,9 +25098,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25109,9 +25116,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 
v3, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25127,9 +25134,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25145,9 +25152,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25163,10 +25170,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: v_add_f32_e32 v32, 
0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25182,9 +25189,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25200,9 +25207,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25218,9 +25225,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25236,9 +25243,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25254,9 +25261,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25272,9 +25279,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25290,9 +25297,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, 
v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25308,9 +25315,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25326,9 +25333,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25344,9 +25351,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25362,9 +25369,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { 
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25380,9 +25387,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25398,9 +25405,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25416,9 +25423,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; 
VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25434,9 +25441,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -25486,9 +25493,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25501,9 +25508,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25516,9 +25523,9 @@ 
define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25531,9 +25538,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25546,9 +25553,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25561,9 +25568,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; 
GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25576,9 +25583,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25591,9 +25598,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25606,9 +25613,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 
v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25621,9 +25628,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25636,9 +25643,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25651,9 +25658,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25666,9 +25673,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25681,9 +25688,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25696,9 +25703,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25711,10 +25718,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 
0x40c00000, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25727,9 +25734,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25742,9 +25749,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25757,9 +25764,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 
v28, 0xffff0000, v28 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25772,9 +25779,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25787,9 +25794,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25802,9 +25809,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25817,9 +25824,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25832,9 +25839,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25847,9 +25854,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25862,9 +25869,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; 
GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25877,9 +25884,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25892,9 +25899,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25907,9 +25914,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_add3_u32 v33, v33, 
v32, s6 @@ -25922,9 +25929,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -25937,9 +25944,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -27706,8 +27713,8 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -27740,9 +27747,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: 
v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27758,9 +27765,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27776,9 +27783,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27794,9 +27801,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27812,9 +27819,9 @@ 
define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27830,9 +27837,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27848,9 +27855,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27866,9 +27873,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, 
v7 ; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27884,9 +27891,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27902,9 +27909,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27920,9 +27927,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27938,9 +27945,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27956,9 +27963,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27974,9 +27981,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -27992,9 +27999,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28010,9 +28017,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 ; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28028,9 +28035,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28046,9 +28053,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 
0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28064,9 +28071,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28082,9 +28089,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28100,9 +28107,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28118,9 +28125,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x 
bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28136,9 +28143,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28154,9 +28161,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28172,9 +28179,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28190,9 +28197,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28208,9 +28215,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28226,9 +28233,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28244,9 +28251,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 ; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28262,9 +28269,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28280,9 +28287,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -28329,8 +28336,8 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: 
v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -28966,864 +28973,1020 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; 
GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: 
scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 
offset:44 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 
off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, 
v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-NEXT: .LBB19_2: ; %cmp.true -; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s4, s27, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s5, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 -; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 
1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 -; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s4, s26, 16 +; GFX11-NEXT: s_and_b32 s5, s24, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 
v6, v2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_lshl_b32 s4, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: 
v_lshl_or_b32 v30, v0, 16, v1 +; GFX11-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: 
v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v3 :: v_dual_add_nc_u32 v3, v6, v1 +; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, 
v15 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v3, v11 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v13, v12 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v9, v8 ; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, 
v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v15, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v12, v10 ; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v15 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v10, v16, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 ; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 +; GFX11-NEXT: 
v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v16 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v9, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v12 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_bfe_u32 v12, v15, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v124, v3, 16, v5 +; GFX11-NEXT: v_lshl_or_b32 v112, v6, 16, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v15 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_add_nc_u32_e32 
v11, 0x7fff, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v12, v11, v12, vcc_lo +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshl_or_b32 v101, v8, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v137, v2, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX11-NEXT: v_lshl_or_b32 v91, v9, 16, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc_lo +; GFX11-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v82, v11, 16, v13 +; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v16 ; GFX11-NEXT: v_add_nc_u32_e32 v17, 
0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v17 ; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-NEXT: v_lshl_or_b32 v74, v15, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v19 +; GFX11-NEXT: v_lshl_or_b32 v67, v17, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, 
v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v21, v21, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v22, v22, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: 
v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-NEXT: v_lshl_or_b32 v61, v19, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s3 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v23 +; GFX11-NEXT: v_lshl_or_b32 v56, v21, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, 
v28 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s2 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v25, v25, v24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x7fff, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v26, v26, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: 
v_cndmask_b32_e32 v25, v26, v27, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s1 ; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_lshl_or_b32 v52, v23, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v27, v27, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX11-NEXT: 
v_or_b32_e32 v29, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v28, v28, v27 +; GFX11-NEXT: v_lshl_or_b32 v49, v25, 16, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v29, v29, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v29 +; GFX11-NEXT: v_dual_cndmask_b32 v28, v29, v31 :: v_dual_lshlrev_b32 v29, 16, v176 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v176 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v47, v27, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v176, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v177 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v177 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; 
GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v177, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v178 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v179 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 
0xffff0000, v179 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, 
v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v179, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v180 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; 
GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v180 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, 
v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v181 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, 
v39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v181 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-NEXT: v_lshl_or_b32 v181, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v182 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v182 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v183 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v183 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; 
GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-NEXT: v_lshl_or_b32 v183, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v170 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v170 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, 
v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 -; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: 
v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_lshl_or_b32 v170, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v171 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v171 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v171, v31, 16, v29 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v29, 16, v172 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v172 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v172, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v173 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v173 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v173, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v174 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v174 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; 
GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v174, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v175 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v175 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 
0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v175, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v185 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v185 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v185, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v184 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; 
GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v184 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v184, v31, 16, v29 ; GFX11-NEXT: .LBB19_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 
v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, 
s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 
offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; 
GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 
offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 
v31, v176 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB19_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 ; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 ; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 ; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: 
$vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: 
$vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 ; GFX11-NEXT: s_branch .LBB19_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -31442,8 +31605,8 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -31521,8 +31684,8 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -33348,8 +33511,8 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -33492,8 +33655,8 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; 
GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -33553,107 +33716,109 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; 
GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:296 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:176 +; GFX11-NEXT: 
scratch_store_b32 off, v95, s32 offset:172 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: 
scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 
off, v152, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:44 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 ; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 ; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 ; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v171, v5 :: v_dual_mov_b32 v172, v4 +; GFX11-NEXT: v_dual_mov_b32 v173, v3 :: v_dual_mov_b32 v174, v2 +; GFX11-NEXT: v_dual_mov_b32 v175, v1 :: v_dual_mov_b32 v184, v0 +; GFX11-NEXT: v_dual_mov_b32 v185, s28 :: v_dual_mov_b32 v186, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 ; 
GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v61, s0 :: v_dual_mov_b32 v66, s2 +; GFX11-NEXT: v_dual_mov_b32 v63, s1 :: v_dual_mov_b32 v70, s3 +; GFX11-NEXT: v_dual_mov_b32 v75, s16 :: v_dual_mov_b32 v88, s18 +; GFX11-NEXT: v_dual_mov_b32 v81, s17 :: v_dual_mov_b32 v96, s19 +; GFX11-NEXT: v_dual_mov_b32 v105, s20 :: v_dual_mov_b32 v126, s22 +; GFX11-NEXT: v_dual_mov_b32 v115, s21 :: v_dual_mov_b32 v138, s23 +; GFX11-NEXT: v_dual_mov_b32 v151, s24 :: v_dual_mov_b32 v28, s26 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v44, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 ; GFX11-NEXT: .LBB23_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v44, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] @@ -33662,142 +33827,142 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; 
GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v138, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v126, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v115, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v105, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v96, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v88, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v81, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v75, 0x200, s16 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v70, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v66, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v63, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB23_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v0, v61 :: v_dual_mov_b32 v1, v63 +; GFX11-NEXT: v_dual_mov_b32 v3, v70 :: v_dual_mov_b32 v4, v75 +; GFX11-NEXT: v_dual_mov_b32 v6, v88 :: v_dual_mov_b32 v9, v115 +; GFX11-NEXT: v_dual_mov_b32 v7, v96 :: v_dual_mov_b32 v8, v105 +; GFX11-NEXT: v_dual_mov_b32 v10, v126 :: v_dual_mov_b32 v15, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v138 :: v_dual_mov_b32 v12, v151 +; GFX11-NEXT: v_dual_mov_b32 v14, v28 :: v_dual_mov_b32 v17, v186 +; GFX11-NEXT: v_dual_mov_b32 v16, v185 :: v_dual_mov_b32 v19, v175 +; GFX11-NEXT: v_dual_mov_b32 v18, v184 :: v_dual_mov_b32 v21, v173 +; GFX11-NEXT: v_dual_mov_b32 v20, v174 :: v_dual_mov_b32 v23, v171 +; GFX11-NEXT: v_dual_mov_b32 v22, v172 :: v_dual_mov_b32 v25, v182 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 
offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v186, off, s32 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:28 
+; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: 
scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 
v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:248 +; 
GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:252 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:292 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:296 +; GFX11-NEXT: v_dual_mov_b32 v2, v66 :: v_dual_mov_b32 v5, v81 +; GFX11-NEXT: v_dual_mov_b32 v24, v183 :: v_dual_mov_b32 v27, v180 +; GFX11-NEXT: v_dual_mov_b32 v26, v181 :: v_dual_mov_b32 v29, v178 +; GFX11-NEXT: v_dual_mov_b32 v28, v179 :: v_dual_mov_b32 v31, v176 +; GFX11-NEXT: v_mov_b32_e32 v30, v177 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: 
$vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92 ; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: 
$vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107 +; GFX11-NEXT: ; implicit-def: 
$vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137 +; GFX11-NEXT: ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158 +; GFX11-NEXT: ; implicit-def: $vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170 ; GFX11-NEXT: s_branch .LBB23_2 %cmp = icmp eq 
i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -34402,9 +34567,10 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v18 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v3, s8, v2, 16 +; SI-NEXT: v_mov_b32_e32 v2, s11 ; SI-NEXT: v_mov_b32_e32 v4, s13 ; SI-NEXT: v_mov_b32_e32 v5, s15 ; SI-NEXT: v_mov_b32_e32 v6, s41 @@ -34419,8 +34585,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, s18 ; SI-NEXT: v_mov_b32_e32 v16, s16 ; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v2, s10, v2, 16 ; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 ; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 ; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 @@ -34452,6 +34617,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s79, s17, 16 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -34466,8 +34632,9 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_mov_b32_e32 v2, s9 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -34482,11 +34649,10 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: 
s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_alignbit_b32 v3, s8, v2, 16 +; SI-NEXT: v_mov_b32_e32 v2, s11 ; SI-NEXT: v_mov_b32_e32 v4, s13 ; SI-NEXT: v_mov_b32_e32 v5, s15 ; SI-NEXT: v_mov_b32_e32 v6, s41 @@ -34501,8 +34667,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, s18 ; SI-NEXT: v_mov_b32_e32 v16, s16 ; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v2, s10, v2, 16 ; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 ; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 ; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 @@ -34686,25 +34851,25 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: s_and_b32 s4, s10, 0xffff ; SI-NEXT: s_lshl_b32 s5, s58, 16 ; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: 
s_and_b32 s4, s8, 0xffff ; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -34728,37 +34893,37 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB25_4: ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr56 +; 
SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v32i32_to_v64i16_scalar: @@ -34785,8 +34950,8 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -34864,8 +35029,8 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -36514,8 +36679,8 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -36574,107 +36739,109 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, 
s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:296 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:268 +; 
GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:172 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 
offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 
offset:164 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:44 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:32 +; GFX11-NEXT: 
scratch_store_b32 off, v171, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 ; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 ; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 ; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v171, v5 :: v_dual_mov_b32 v172, v4 +; GFX11-NEXT: v_dual_mov_b32 v173, v3 :: v_dual_mov_b32 v174, v2 +; GFX11-NEXT: v_dual_mov_b32 v175, v1 :: v_dual_mov_b32 v184, v0 +; GFX11-NEXT: v_dual_mov_b32 v185, s28 :: v_dual_mov_b32 v186, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v61, s0 :: v_dual_mov_b32 v66, s2 +; GFX11-NEXT: 
v_dual_mov_b32 v63, s1 :: v_dual_mov_b32 v70, s3 +; GFX11-NEXT: v_dual_mov_b32 v75, s16 :: v_dual_mov_b32 v88, s18 +; GFX11-NEXT: v_dual_mov_b32 v81, s17 :: v_dual_mov_b32 v96, s19 +; GFX11-NEXT: v_dual_mov_b32 v105, s20 :: v_dual_mov_b32 v126, s22 +; GFX11-NEXT: v_dual_mov_b32 v115, s21 :: v_dual_mov_b32 v138, s23 +; GFX11-NEXT: v_dual_mov_b32 v151, s24 :: v_dual_mov_b32 v28, s26 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v44, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 ; GFX11-NEXT: .LBB27_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v44, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] @@ -36683,142 +36850,142 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v138, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v126, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v115, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v105, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v96, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v88, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v81, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v75, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v70, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v66, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v63, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB27_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, 
v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v0, v61 :: v_dual_mov_b32 v1, v63 +; GFX11-NEXT: v_dual_mov_b32 v3, v70 :: v_dual_mov_b32 v4, v75 +; GFX11-NEXT: v_dual_mov_b32 v6, v88 :: v_dual_mov_b32 v9, v115 +; GFX11-NEXT: v_dual_mov_b32 v7, v96 :: v_dual_mov_b32 v8, v105 +; GFX11-NEXT: v_dual_mov_b32 v10, v126 :: v_dual_mov_b32 v15, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v138 :: v_dual_mov_b32 v12, v151 +; GFX11-NEXT: v_dual_mov_b32 v14, v28 :: v_dual_mov_b32 v17, v186 +; GFX11-NEXT: v_dual_mov_b32 v16, v185 :: v_dual_mov_b32 v19, v175 +; GFX11-NEXT: v_dual_mov_b32 v18, v184 :: v_dual_mov_b32 v21, v173 +; GFX11-NEXT: v_dual_mov_b32 v20, v174 :: v_dual_mov_b32 v23, v171 +; GFX11-NEXT: v_dual_mov_b32 v22, v172 :: v_dual_mov_b32 v25, v182 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: 
scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v186, off, s32 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:84 +; GFX11-NEXT: 
scratch_load_b32 v140, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 
v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:164 +; GFX11-NEXT: 
scratch_load_b32 v104, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:252 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:292 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:296 +; GFX11-NEXT: v_dual_mov_b32 v2, v66 :: v_dual_mov_b32 v5, v81 +; GFX11-NEXT: v_dual_mov_b32 v24, v183 
:: v_dual_mov_b32 v27, v180 +; GFX11-NEXT: v_dual_mov_b32 v26, v181 :: v_dual_mov_b32 v29, v178 +; GFX11-NEXT: v_dual_mov_b32 v28, v179 :: v_dual_mov_b32 v31, v176 +; GFX11-NEXT: v_mov_b32_e32 v30, v177 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB27_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: 
$vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92 ; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: 
$vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 +; GFX11-NEXT: ; implicit-def: 
$vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128 +; GFX11-NEXT: ; implicit-def: 
$vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137 +; GFX11-NEXT: ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158 +; GFX11-NEXT: ; implicit-def: $vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170 ; GFX11-NEXT: s_branch .LBB27_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -37065,8 +37232,8 @@ define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -37144,8 +37311,8 @@ define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: 
v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -37223,8 +37390,8 @@ define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -37596,8 +37763,8 @@ define inreg <32 x float> @bitcast_v16i64_to_v32f32_scalar(<16 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -37675,8 +37842,8 @@ define inreg <32 x float> @bitcast_v16i64_to_v32f32_scalar(<16 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -37754,8 +37921,8 @@ define inreg <32 x float> @bitcast_v16i64_to_v32f32_scalar(<16 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -38127,8 +38294,8 @@ define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; 
SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -38206,8 +38373,8 @@ define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -38285,8 +38452,8 @@ define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -38567,26 +38734,26 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_mov_b32_e32 v31, v17 -; SI-NEXT: v_mov_b32_e32 v30, v16 ; SI-NEXT: v_mov_b32_e32 v29, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v30, v16 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v26, v12 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v33, v5 -; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v15, 
v1 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -38631,26 +38798,26 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 -; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -38695,26 +38862,26 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_mov_b32_e32 v31, v17 -; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, 
v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -42523,32 +42690,30 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, 
s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v31, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 @@ -42643,26 +42808,28 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, 
v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 @@ -42744,9 +42911,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-FAKE16-NEXT: .LBB36_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -42864,48 +43030,48 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB36_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v67, v65 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v39, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v65, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 
0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v56 @@ -43167,27 +43333,26 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, 
off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -43229,11 +43394,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v56, s16 +; SI-NEXT: v_mov_b32_e32 v43, s17 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v47, s17 -; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v45, s18 ; SI-NEXT: v_mov_b32_e32 v42, s19 -; 
SI-NEXT: v_mov_b32_e32 v40, s20 +; SI-NEXT: v_mov_b32_e32 v55, s20 ; SI-NEXT: v_mov_b32_e32 v53, s21 ; SI-NEXT: v_mov_b32_e32 v51, s22 ; SI-NEXT: v_mov_b32_e32 v48, s23 @@ -43345,11 +43510,265 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v19, v35, v38, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; SI-NEXT: v_alignbit_b32 v23, v53, v55, 24 +; SI-NEXT: v_alignbit_b32 v26, v53, v55, 16 +; SI-NEXT: v_alignbit_b32 v29, v53, v55, 8 +; SI-NEXT: v_alignbit_b32 v32, v42, v45, 24 +; SI-NEXT: v_alignbit_b32 v36, v42, v45, 16 +; SI-NEXT: v_alignbit_b32 v39, v42, v45, 8 +; SI-NEXT: v_alignbit_b32 v50, v43, v56, 24 +; SI-NEXT: v_alignbit_b32 v54, v43, v56, 16 +; SI-NEXT: v_alignbit_b32 v41, v43, v56, 8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: buffer_store_dword v19, off, 
s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v53 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v43 +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 24 
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 16 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 16 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 +; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -43423,264 +43842,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> 
inreg %a, ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; SI-NEXT: v_alignbit_b32 v23, v53, v40, 24 -; SI-NEXT: v_alignbit_b32 v26, v53, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v53, v40, 8 -; SI-NEXT: v_alignbit_b32 v32, v42, v44, 24 -; SI-NEXT: v_alignbit_b32 v36, v42, v44, 16 -; SI-NEXT: v_alignbit_b32 v39, v42, v44, 8 -; SI-NEXT: v_alignbit_b32 v50, v47, v56, 24 -; SI-NEXT: v_alignbit_b32 v54, v47, v56, 16 -; SI-NEXT: v_alignbit_b32 v41, v47, v56, 8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 -; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v48 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 
v57, 8, v48 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v53 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v47 -; SI-NEXT: s_cbranch_execnz .LBB37_3 -; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 16 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 16 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 16 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, 
v5, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 16 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 16 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 16 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 16 -; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 24 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 -; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_lshrrev_b32_e32 v19, 8, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; SI-NEXT: buffer_store_dword 
v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill @@ -43689,24 +43854,24 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_add_f32_e32 v47, 1.0, v47 +; SI-NEXT: v_add_f32_e32 v43, 1.0, v43 ; SI-NEXT: v_add_f32_e32 v56, 1.0, v56 ; SI-NEXT: v_add_f32_e32 v42, 1.0, v42 -; 
SI-NEXT: v_add_f32_e32 v44, 1.0, v44 +; SI-NEXT: v_add_f32_e32 v45, 1.0, v45 ; SI-NEXT: v_add_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v40, 1.0, v40 +; SI-NEXT: v_add_f32_e32 v55, 1.0, v55 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; SI-NEXT: v_alignbit_b32 v23, v53, v40, 24 -; SI-NEXT: v_alignbit_b32 v26, v53, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v53, v40, 8 -; SI-NEXT: v_alignbit_b32 v32, v42, v44, 24 -; SI-NEXT: v_alignbit_b32 v36, v42, v44, 16 -; SI-NEXT: v_alignbit_b32 v39, v42, v44, 8 -; SI-NEXT: v_alignbit_b32 v50, v47, v56, 24 -; SI-NEXT: v_alignbit_b32 v54, v47, v56, 16 -; SI-NEXT: v_alignbit_b32 v41, v47, v56, 8 +; SI-NEXT: v_alignbit_b32 v23, v53, v55, 24 +; SI-NEXT: v_alignbit_b32 v26, v53, v55, 16 +; SI-NEXT: v_alignbit_b32 v29, v53, v55, 8 +; SI-NEXT: v_alignbit_b32 v32, v42, v45, 24 +; SI-NEXT: v_alignbit_b32 v36, v42, v45, 16 +; SI-NEXT: v_alignbit_b32 v39, v42, v45, 8 +; SI-NEXT: v_alignbit_b32 v50, v43, v56, 24 +; SI-NEXT: v_alignbit_b32 v54, v43, v56, 16 +; SI-NEXT: v_alignbit_b32 v41, v43, v56, 8 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 @@ -43716,10 +43881,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 ; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v48 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v48 ; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v48 ; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 ; SI-NEXT: 
v_lshrrev_b32_e32 v59, 16, v53 @@ -43728,9 +43893,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 ; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v47 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v43 ; SI-NEXT: .LBB37_3: ; %end ; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 @@ -43743,7 +43908,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v50, v54, v50 ; SI-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v50, 0xff, v43 ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_or_b32_e32 v21, v50, v21 @@ -43755,7 +43920,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v45 ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v39 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v36 @@ -43779,7 +43944,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v29 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v26 @@ -43825,9 +43990,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x 
float> inreg %a, ; SI-NEXT: v_and_b32_e32 v19, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v57 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v46 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 @@ -43836,11 +44001,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v38 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 @@ -43853,9 +44018,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v44 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v52 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -43979,11 +44144,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -44286,10 +44451,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -46982,246 +47147,243 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v75, s30, 0 -; GFX11-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-NEXT: v_writelane_b32 v74, s30, 0 +; GFX11-NEXT: v_writelane_b32 v75, s96, 0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 
; GFX11-NEXT: v_readfirstlane_b32 s40, v1 ; GFX11-NEXT: v_readfirstlane_b32 s41, v2 -; GFX11-NEXT: v_writelane_b32 v75, s31, 1 -; GFX11-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-NEXT: v_writelane_b32 v74, s31, 1 +; GFX11-NEXT: v_writelane_b32 v75, s97, 1 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 ; GFX11-NEXT: v_readfirstlane_b32 s4, v5 -; GFX11-NEXT: v_writelane_b32 v75, s34, 2 -; GFX11-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-NEXT: v_writelane_b32 v74, s34, 2 +; GFX11-NEXT: v_writelane_b32 v75, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s5, v6 ; GFX11-NEXT: v_readfirstlane_b32 s6, v7 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_writelane_b32 v75, s35, 3 -; GFX11-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-NEXT: v_writelane_b32 v74, s35, 3 +; GFX11-NEXT: v_writelane_b32 v75, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 ; GFX11-NEXT: v_readfirstlane_b32 s10, v11 -; GFX11-NEXT: v_writelane_b32 v75, s36, 4 -; GFX11-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-NEXT: v_writelane_b32 v74, s36, 4 +; GFX11-NEXT: v_writelane_b32 v75, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s11, v12 ; GFX11-NEXT: v_readfirstlane_b32 s12, v13 ; GFX11-NEXT: v_readfirstlane_b32 s13, v14 -; GFX11-NEXT: v_writelane_b32 v75, s37, 5 -; GFX11-NEXT: v_writelane_b32 v76, s101, 5 -; GFX11-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-NEXT: v_writelane_b32 v74, s37, 5 +; GFX11-NEXT: v_writelane_b32 v75, s101, 5 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, 
v47, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 +; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane ; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s39, 
7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 -; GFX11-NEXT: v_writelane_b32 v75, s48, 8 -; GFX11-NEXT: v_writelane_b32 v76, s104, 8 -; GFX11-NEXT: v_writelane_b32 v75, s49, 9 -; GFX11-NEXT: v_writelane_b32 v75, s50, 10 -; GFX11-NEXT: v_writelane_b32 v75, s51, 11 -; GFX11-NEXT: v_writelane_b32 v75, s52, 12 -; GFX11-NEXT: v_writelane_b32 v75, s53, 13 -; GFX11-NEXT: v_writelane_b32 v75, s54, 14 -; GFX11-NEXT: v_writelane_b32 v75, s55, 15 -; GFX11-NEXT: v_writelane_b32 v75, s64, 16 -; GFX11-NEXT: v_writelane_b32 v75, s65, 17 -; GFX11-NEXT: v_writelane_b32 v75, s66, 18 -; GFX11-NEXT: v_writelane_b32 v75, s67, 19 -; GFX11-NEXT: v_writelane_b32 v75, s68, 20 -; GFX11-NEXT: v_writelane_b32 v75, s69, 21 -; GFX11-NEXT: v_writelane_b32 v75, s70, 22 -; GFX11-NEXT: v_writelane_b32 v75, s71, 23 -; GFX11-NEXT: v_writelane_b32 v75, s80, 24 -; GFX11-NEXT: v_writelane_b32 v75, s81, 25 -; GFX11-NEXT: v_writelane_b32 v75, s82, 26 -; GFX11-NEXT: v_writelane_b32 v75, s83, 27 -; GFX11-NEXT: v_writelane_b32 v75, s84, 28 -; GFX11-NEXT: v_writelane_b32 v75, s85, 29 -; GFX11-NEXT: v_writelane_b32 v75, s86, 30 -; GFX11-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-NEXT: v_writelane_b32 v74, s38, 6 +; GFX11-NEXT: v_writelane_b32 v75, s102, 6 +; GFX11-NEXT: v_writelane_b32 v74, s39, 7 +; GFX11-NEXT: v_writelane_b32 v75, s103, 7 +; GFX11-NEXT: v_writelane_b32 v74, s48, 8 +; GFX11-NEXT: v_writelane_b32 v75, s104, 8 +; GFX11-NEXT: v_writelane_b32 v74, s49, 9 +; GFX11-NEXT: v_writelane_b32 v74, s50, 10 +; GFX11-NEXT: v_writelane_b32 v74, s51, 11 +; GFX11-NEXT: v_writelane_b32 v74, s52, 12 +; GFX11-NEXT: v_writelane_b32 v74, s53, 13 +; GFX11-NEXT: v_writelane_b32 v74, s54, 14 +; GFX11-NEXT: v_writelane_b32 v74, s55, 15 +; GFX11-NEXT: v_writelane_b32 v74, s64, 16 +; GFX11-NEXT: v_writelane_b32 v74, s65, 17 +; GFX11-NEXT: v_writelane_b32 v74, s66, 18 +; GFX11-NEXT: v_writelane_b32 v74, s67, 19 +; GFX11-NEXT: v_writelane_b32 v74, s68, 20 +; GFX11-NEXT: v_writelane_b32 v74, s69, 21 +; GFX11-NEXT: 
v_writelane_b32 v74, s70, 22 +; GFX11-NEXT: v_writelane_b32 v74, s71, 23 +; GFX11-NEXT: v_writelane_b32 v74, s80, 24 +; GFX11-NEXT: v_writelane_b32 v74, s81, 25 +; GFX11-NEXT: v_writelane_b32 v74, s82, 26 +; GFX11-NEXT: s_mov_b32 s82, 0 +; GFX11-NEXT: v_writelane_b32 v74, s83, 27 +; GFX11-NEXT: v_writelane_b32 v74, s84, 28 +; GFX11-NEXT: v_writelane_b32 v74, s85, 29 +; GFX11-NEXT: v_writelane_b32 v74, s86, 30 +; GFX11-NEXT: v_writelane_b32 v74, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s42, s13, 24 -; GFX11-NEXT: s_lshr_b32 s36, s27, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-NEXT: s_lshr_b32 s34, s27, 16 +; GFX11-NEXT: v_writelane_b32 v77, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s38, s27, 8 -; GFX11-NEXT: s_lshr_b32 s37, s26, 16 -; GFX11-NEXT: s_lshr_b32 s39, s26, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-NEXT: s_lshr_b32 s36, s27, 8 +; GFX11-NEXT: s_lshr_b32 s35, s26, 16 +; GFX11-NEXT: s_lshr_b32 s37, s26, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s13, 8 -; GFX11-NEXT: s_lshr_b32 s48, s25, 24 -; GFX11-NEXT: s_lshr_b32 s49, s25, 16 -; GFX11-NEXT: s_lshr_b32 s51, s25, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-NEXT: s_lshr_b32 s38, s25, 24 +; GFX11-NEXT: s_lshr_b32 s39, s25, 16 +; GFX11-NEXT: s_lshr_b32 s49, s25, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s12, 16 -; GFX11-NEXT: s_lshr_b32 s50, s24, 16 -; GFX11-NEXT: s_lshr_b32 s52, s24, 8 -; GFX11-NEXT: s_lshr_b32 s53, s23, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-NEXT: s_lshr_b32 s48, s24, 16 +; GFX11-NEXT: s_lshr_b32 s50, s24, 8 +; GFX11-NEXT: s_lshr_b32 s51, s23, 24 +; GFX11-NEXT: v_writelane_b32 v77, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s12, 8 -; GFX11-NEXT: s_lshr_b32 s54, s23, 16 -; GFX11-NEXT: s_lshr_b32 s64, s23, 8 -; GFX11-NEXT: s_lshr_b32 s55, s22, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 4 +; 
GFX11-NEXT: s_lshr_b32 s52, s23, 16 +; GFX11-NEXT: s_lshr_b32 s54, s23, 8 +; GFX11-NEXT: s_lshr_b32 s53, s22, 16 +; GFX11-NEXT: v_writelane_b32 v77, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s11, 24 -; GFX11-NEXT: s_lshr_b32 s65, s22, 8 -; GFX11-NEXT: s_lshr_b32 s66, s21, 24 -; GFX11-NEXT: s_lshr_b32 s67, s21, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-NEXT: s_lshr_b32 s55, s22, 8 +; GFX11-NEXT: s_lshr_b32 s64, s21, 24 +; GFX11-NEXT: s_lshr_b32 s65, s21, 16 +; GFX11-NEXT: v_writelane_b32 v77, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s11, 16 -; GFX11-NEXT: s_lshr_b32 s69, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s20, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-NEXT: s_lshr_b32 s67, s21, 8 +; GFX11-NEXT: s_lshr_b32 s66, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s11, 8 -; GFX11-NEXT: s_lshr_b32 s71, s19, 24 -; GFX11-NEXT: s_lshr_b32 s80, s19, 16 -; GFX11-NEXT: s_lshr_b32 s82, s19, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: s_lshr_b32 s80, s19, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s10, 16 -; GFX11-NEXT: s_lshr_b32 s81, s18, 16 -; GFX11-NEXT: s_lshr_b32 s83, s18, 8 -; GFX11-NEXT: s_lshr_b32 s84, s17, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-NEXT: s_lshr_b32 s71, s18, 16 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: s_lshr_b32 s83, s17, 24 +; GFX11-NEXT: v_writelane_b32 v77, s42, 0 ; GFX11-NEXT: s_lshr_b32 s42, s10, 8 -; GFX11-NEXT: s_lshr_b32 s85, s17, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 31 +; GFX11-NEXT: s_lshr_b32 s84, s17, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 31 ; GFX11-NEXT: s_lshr_b32 s42, s9, 24 -; GFX11-NEXT: s_lshr_b32 s87, s17, 8 -; GFX11-NEXT: s_lshr_b32 s86, s16, 16 -; GFX11-NEXT: s_lshr_b32 s96, s16, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 30 +; GFX11-NEXT: 
s_lshr_b32 s86, s17, 8 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: s_lshr_b32 s87, s16, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 30 ; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s97, s3, 24 -; GFX11-NEXT: s_lshr_b32 s98, s3, 16 -; GFX11-NEXT: s_lshr_b32 s100, s3, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 29 +; GFX11-NEXT: s_lshr_b32 s96, s3, 24 +; GFX11-NEXT: s_lshr_b32 s97, s3, 16 +; GFX11-NEXT: s_lshr_b32 s99, s3, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s9, 8 -; GFX11-NEXT: s_lshr_b32 s99, s2, 16 -; GFX11-NEXT: s_lshr_b32 s101, s2, 8 -; GFX11-NEXT: s_lshr_b32 s102, s1, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 28 +; GFX11-NEXT: s_lshr_b32 s98, s2, 16 +; GFX11-NEXT: s_lshr_b32 s100, s2, 8 +; GFX11-NEXT: s_lshr_b32 s101, s1, 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s8, 16 -; GFX11-NEXT: s_lshr_b32 s103, s1, 16 -; GFX11-NEXT: s_lshr_b32 s34, s1, 8 -; GFX11-NEXT: s_lshr_b32 s104, s0, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 27 +; GFX11-NEXT: s_lshr_b32 s102, s1, 16 +; GFX11-NEXT: s_lshr_b32 s104, s1, 8 +; GFX11-NEXT: s_lshr_b32 s103, s0, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s8, 8 -; GFX11-NEXT: s_lshr_b32 s35, s0, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; GFX11-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 26 +; GFX11-NEXT: v_writelane_b32 v76, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s7, 24 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 25 +; GFX11-NEXT: v_writelane_b32 v76, s42, 25 ; GFX11-NEXT: s_lshr_b32 s42, s7, 16 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[40:41], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 24 +; 
GFX11-NEXT: v_writelane_b32 v76, s42, 24 ; GFX11-NEXT: s_lshr_b32 s42, s7, 8 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 ; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 23 +; GFX11-NEXT: v_writelane_b32 v76, s42, 23 ; GFX11-NEXT: s_lshr_b32 s42, s6, 16 ; GFX11-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 22 +; GFX11-NEXT: v_writelane_b32 v76, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s6, 8 ; GFX11-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 21 +; GFX11-NEXT: v_writelane_b32 v76, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s5, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 20 +; GFX11-NEXT: v_writelane_b32 v76, s42, 20 ; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 19 +; GFX11-NEXT: v_writelane_b32 v76, s42, 19 ; GFX11-NEXT: s_lshr_b32 s42, s5, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 18 +; GFX11-NEXT: v_writelane_b32 v76, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s4, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 17 +; GFX11-NEXT: v_writelane_b32 v76, s42, 17 ; GFX11-NEXT: s_lshr_b32 s42, s4, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 16 ; GFX11-NEXT: s_lshr_b32 s42, s15, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 15 +; GFX11-NEXT: v_writelane_b32 v76, s42, 15 ; GFX11-NEXT: s_lshr_b32 s42, s15, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 14 +; GFX11-NEXT: v_writelane_b32 v76, s42, 14 ; 
GFX11-NEXT: s_lshr_b32 s42, s15, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 13 +; GFX11-NEXT: v_writelane_b32 v76, s42, 13 ; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 12 +; GFX11-NEXT: v_writelane_b32 v76, s42, 12 ; GFX11-NEXT: s_lshr_b32 s42, s14, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 11 +; GFX11-NEXT: v_writelane_b32 v76, s42, 11 ; GFX11-NEXT: s_lshr_b32 s42, s41, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 10 +; GFX11-NEXT: v_writelane_b32 v76, s42, 10 ; GFX11-NEXT: s_lshr_b32 s42, s41, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 9 +; GFX11-NEXT: v_writelane_b32 v76, s42, 9 ; GFX11-NEXT: s_lshr_b32 s42, s41, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s40, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-NEXT: v_writelane_b32 v76, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s40, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-NEXT: v_writelane_b32 v76, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s29, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-NEXT: v_writelane_b32 v76, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s29, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-NEXT: v_writelane_b32 v76, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s29, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-NEXT: v_writelane_b32 v76, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s28, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 2 +; 
GFX11-NEXT: v_writelane_b32 v76, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s28, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-NEXT: v_writelane_b32 v76, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s27, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-NEXT: v_writelane_b32 v76, s42, 0 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s82 ; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 ; GFX11-NEXT: .LBB37_2: ; %cmp.true -; GFX11-NEXT: v_add_f32_e64 v22, s27, 1.0 -; GFX11-NEXT: v_add_f32_e64 v21, s26, 1.0 ; GFX11-NEXT: v_add_f32_e64 v24, s25, 1.0 ; GFX11-NEXT: v_add_f32_e64 v23, s24, 1.0 ; GFX11-NEXT: v_add_f32_e64 v29, s23, 1.0 @@ -47232,17 +47394,19 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_add_f32_e64 v34, s18, 1.0 ; GFX11-NEXT: v_add_f32_e64 v37, s17, 1.0 ; GFX11-NEXT: v_add_f32_e64 v36, s16, 1.0 -; GFX11-NEXT: v_add_f32_e64 v6, s9, 1.0 -; GFX11-NEXT: v_add_f32_e64 v5, s8, 1.0 -; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[21:22] -; GFX11-NEXT: v_add_f32_e64 v53, s1, 1.0 -; GFX11-NEXT: v_add_f32_e64 v52, s0, 1.0 -; GFX11-NEXT: v_add_f32_e64 v49, s3, 1.0 -; GFX11-NEXT: v_add_f32_e64 v48, s2, 1.0 -; GFX11-NEXT: v_add_f32_e64 v18, s29, 1.0 -; GFX11-NEXT: v_add_f32_e64 v17, s28, 1.0 ; GFX11-NEXT: v_add_f32_e64 v14, s41, 1.0 ; GFX11-NEXT: v_add_f32_e64 v13, s40, 1.0 +; GFX11-NEXT: v_add_f32_e64 v51, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v50, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v18, s29, 1.0 +; GFX11-NEXT: v_add_f32_e64 v17, s28, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s9, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s8, 1.0 +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[23:24] +; GFX11-NEXT: v_add_f32_e64 v49, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v48, s0, 1.0 +; GFX11-NEXT: v_add_f32_e64 v22, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v21, s26, 1.0 ; GFX11-NEXT: v_add_f32_e64 v12, s15, 1.0 ; 
GFX11-NEXT: v_add_f32_e64 v11, s14, 1.0 ; GFX11-NEXT: v_add_f32_e64 v10, s5, 1.0 @@ -47253,152 +47417,153 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_add_f32_e64 v3, s10, 1.0 ; GFX11-NEXT: v_add_f32_e64 v7, s6, 1.0 ; GFX11-NEXT: v_add_f32_e64 v9, s4, 1.0 -; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[23:24] -; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[28:29] -; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[30:31] -; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[28:29] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[30:31] +; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[36:37] ; GFX11-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[50:51] ; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[3:4] ; GFX11-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] ; GFX11-NEXT: v_lshrrev_b64 v[32:33], 24, v[9:10] ; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] -; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] ; GFX11-NEXT: v_lshrrev_b32_e32 v16, 24, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v6 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v85, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 8, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 24, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v28 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v177, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 24, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 24, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 24, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 16, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 24, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 16, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 16, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 24, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 16, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 8, v10 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 24, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 24, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 24, v37 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 24, v51 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 8, v51 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v50 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v50 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 24, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v48 ; GFX11-NEXT: s_branch .LBB37_5 ; GFX11-NEXT: .LBB37_3: ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 +; GFX11-NEXT: s_mov_b32 s82, -1 ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr104 ; GFX11-NEXT: ; implicit-def: $sgpr102 ; GFX11-NEXT: ; implicit-def: $sgpr101 -; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr100 ; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr99 ; GFX11-NEXT: ; implicit-def: $sgpr97 ; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr86 -; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: ; implicit-def: $sgpr87 ; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr86 ; GFX11-NEXT: ; implicit-def: $sgpr84 ; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr71 ; GFX11-NEXT: ; implicit-def: $sgpr56 
-; GFX11-NEXT: ; implicit-def: $sgpr82 ; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr71 ; GFX11-NEXT: ; implicit-def: $sgpr70 +; GFX11-NEXT: ; implicit-def: $sgpr69 ; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr66 ; GFX11-NEXT: ; implicit-def: $sgpr58 -; GFX11-NEXT: ; implicit-def: $sgpr69 ; GFX11-NEXT: ; implicit-def: $sgpr67 -; GFX11-NEXT: ; implicit-def: $sgpr66 ; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr64 ; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr64 ; GFX11-NEXT: ; implicit-def: $sgpr54 -; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr50 ; GFX11-NEXT: ; implicit-def: $sgpr51 -; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr50 ; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr49 ; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr37 ; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr35 ; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr34 ; GFX11-NEXT: ; implicit-def: $sgpr30 ; GFX11-NEXT: ; implicit-def: $sgpr94 ; GFX11-NEXT: ; implicit-def: $sgpr92 @@ -47489,337 +47654,301 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: ; kill: killed $sgpr43 ; GFX11-NEXT: s_branch .LBB37_2 ; GFX11-NEXT: .LBB37_4: -; GFX11-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 -; GFX11-NEXT: v_readlane_b32 s0, v77, 0 -; GFX11-NEXT: v_dual_mov_b32 v147, s36 :: v_dual_mov_b32 v48, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s3 :: v_dual_mov_b32 v36, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v37, s17 :: v_dual_mov_b32 v148, s0 -; GFX11-NEXT: 
v_readlane_b32 s0, v77, 1 +; GFX11-NEXT: v_dual_mov_b32 v48, s0 :: v_dual_mov_b32 v49, s1 +; GFX11-NEXT: v_readlane_b32 s0, v76, 0 +; GFX11-NEXT: v_mov_b32_e32 v146, s34 +; GFX11-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 +; GFX11-NEXT: v_dual_mov_b32 v36, s16 :: v_dual_mov_b32 v37, s17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v147, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 1 ; GFX11-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 ; GFX11-NEXT: v_dual_mov_b32 v30, s20 :: v_dual_mov_b32 v31, s21 -; GFX11-NEXT: v_mov_b32_e32 v146, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-NEXT: v_mov_b32_e32 v145, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 2 ; GFX11-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 ; GFX11-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v145, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-NEXT: v_mov_b32_e32 v144, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 3 ; GFX11-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 ; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v18, s29 -; GFX11-NEXT: v_mov_b32_e32 v144, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-NEXT: v_mov_b32_e32 v135, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 4 ; GFX11-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 ; GFX11-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v134, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-NEXT: v_mov_b32_e32 v133, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 5 ; GFX11-NEXT: v_dual_mov_b32 v9, s4 :: v_dual_mov_b32 v10, s5 ; GFX11-NEXT: v_dual_mov_b32 v7, s6 :: v_dual_mov_b32 v8, s7 -; GFX11-NEXT: v_mov_b32_e32 v135, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 6 +; 
GFX11-NEXT: v_mov_b32_e32 v134, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 6 ; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 ; GFX11-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v133, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 7 -; GFX11-NEXT: v_dual_mov_b32 v1, s12 :: v_dual_mov_b32 v2, s13 -; GFX11-NEXT: v_dual_mov_b32 v74, s35 :: v_dual_mov_b32 v73, s104 ; GFX11-NEXT: v_mov_b32_e32 v132, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: v_dual_mov_b32 v72, s34 :: v_dual_mov_b32 v63, s103 -; GFX11-NEXT: v_dual_mov_b32 v62, s102 :: v_dual_mov_b32 v61, s101 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-NEXT: v_dual_mov_b32 v1, s12 :: v_dual_mov_b32 v2, s13 +; GFX11-NEXT: v_dual_mov_b32 v73, vcc_hi :: v_dual_mov_b32 v72, s103 ; GFX11-NEXT: v_mov_b32_e32 v131, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 9 -; GFX11-NEXT: v_dual_mov_b32 v60, s99 :: v_dual_mov_b32 v59, s100 -; GFX11-NEXT: v_dual_mov_b32 v58, s98 :: v_dual_mov_b32 v57, s97 -; GFX11-NEXT: v_mov_b32_e32 v129, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 10 -; GFX11-NEXT: v_dual_mov_b32 v56, s96 :: v_dual_mov_b32 v47, s86 -; GFX11-NEXT: v_dual_mov_b32 v46, s87 :: v_dual_mov_b32 v45, s85 +; GFX11-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-NEXT: v_dual_mov_b32 v63, s104 :: v_dual_mov_b32 v62, s102 +; GFX11-NEXT: v_dual_mov_b32 v61, s101 :: v_dual_mov_b32 v60, s100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v130, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 11 -; GFX11-NEXT: v_dual_mov_b32 v44, s84 :: v_dual_mov_b32 v43, s83 -; GFX11-NEXT: v_dual_mov_b32 v42, s81 :: v_dual_mov_b32 v41, s82 +; GFX11-NEXT: v_readlane_b32 s0, v76, 9 +; GFX11-NEXT: v_dual_mov_b32 v59, s98 :: v_dual_mov_b32 v58, s99 +; GFX11-NEXT: 
v_dual_mov_b32 v57, s97 :: v_dual_mov_b32 v56, s96 ; GFX11-NEXT: v_mov_b32_e32 v128, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 12 -; GFX11-NEXT: v_dual_mov_b32 v40, s80 :: v_dual_mov_b32 v183, s71 -; GFX11-NEXT: v_dual_mov_b32 v182, s70 :: v_dual_mov_b32 v181, s68 +; GFX11-NEXT: v_readlane_b32 s0, v76, 10 +; GFX11-NEXT: v_dual_mov_b32 v47, s87 :: v_dual_mov_b32 v46, s85 +; GFX11-NEXT: v_dual_mov_b32 v45, s86 :: v_dual_mov_b32 v44, s84 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v129, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 11 +; GFX11-NEXT: v_dual_mov_b32 v43, s83 :: v_dual_mov_b32 v42, s81 +; GFX11-NEXT: v_dual_mov_b32 v41, s71 :: v_dual_mov_b32 v40, s80 ; GFX11-NEXT: v_mov_b32_e32 v119, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 13 -; GFX11-NEXT: v_dual_mov_b32 v180, s69 :: v_dual_mov_b32 v179, s67 -; GFX11-NEXT: v_dual_mov_b32 v178, s66 :: v_dual_mov_b32 v177, s65 -; GFX11-NEXT: v_mov_b32_e32 v118, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 14 -; GFX11-NEXT: v_dual_mov_b32 v176, s55 :: v_dual_mov_b32 v167, s64 -; GFX11-NEXT: v_dual_mov_b32 v166, s54 :: v_dual_mov_b32 v165, s53 +; GFX11-NEXT: v_readlane_b32 s0, v76, 12 +; GFX11-NEXT: v_dual_mov_b32 v183, s70 :: v_dual_mov_b32 v182, s69 +; GFX11-NEXT: v_dual_mov_b32 v181, s68 :: v_dual_mov_b32 v180, s66 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v116, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 15 -; GFX11-NEXT: v_dual_mov_b32 v164, s52 :: v_dual_mov_b32 v163, s50 -; GFX11-NEXT: v_dual_mov_b32 v162, s51 :: v_dual_mov_b32 v161, s49 +; GFX11-NEXT: v_mov_b32_e32 v118, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 13 +; GFX11-NEXT: v_dual_mov_b32 v179, s67 :: v_dual_mov_b32 v178, s65 +; GFX11-NEXT: v_dual_mov_b32 v177, s64 :: v_dual_mov_b32 v176, s55 ; GFX11-NEXT: v_mov_b32_e32 v117, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 16 -; GFX11-NEXT: v_dual_mov_b32 v160, s48 :: 
v_dual_mov_b32 v151, s39 -; GFX11-NEXT: v_dual_mov_b32 v150, s37 :: v_dual_mov_b32 v149, s38 +; GFX11-NEXT: v_readlane_b32 s0, v76, 14 +; GFX11-NEXT: v_dual_mov_b32 v167, s53 :: v_dual_mov_b32 v166, s54 +; GFX11-NEXT: v_dual_mov_b32 v165, s52 :: v_dual_mov_b32 v164, s51 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v115, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 17 -; GFX11-NEXT: v_dual_mov_b32 v15, s62 :: v_dual_mov_b32 v38, s88 -; GFX11-NEXT: v_dual_mov_b32 v19, s72 :: v_dual_mov_b32 v50, s90 -; GFX11-NEXT: v_mov_b32_e32 v114, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 18 -; GFX11-NEXT: v_dual_mov_b32 v25, s74 :: v_dual_mov_b32 v54, s92 -; GFX11-NEXT: v_dual_mov_b32 v64, s94 :: v_dual_mov_b32 v65, s30 +; GFX11-NEXT: v_readlane_b32 s0, v76, 15 +; GFX11-NEXT: v_dual_mov_b32 v163, s50 :: v_dual_mov_b32 v162, s48 +; GFX11-NEXT: v_dual_mov_b32 v161, s49 :: v_dual_mov_b32 v160, s39 +; GFX11-NEXT: v_mov_b32_e32 v116, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 16 +; GFX11-NEXT: v_dual_mov_b32 v151, s38 :: v_dual_mov_b32 v150, s37 +; GFX11-NEXT: v_dual_mov_b32 v149, s35 :: v_dual_mov_b32 v148, s36 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v114, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 17 +; GFX11-NEXT: v_dual_mov_b32 v15, s62 :: v_dual_mov_b32 v38, s88 +; GFX11-NEXT: v_dual_mov_b32 v19, s72 :: v_dual_mov_b32 v52, s90 ; GFX11-NEXT: v_mov_b32_e32 v113, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 19 -; GFX11-NEXT: v_dual_mov_b32 v66, s60 :: v_dual_mov_b32 v67, s58 -; GFX11-NEXT: v_dual_mov_b32 v68, s56 :: v_dual_mov_b32 v69, s46 -; GFX11-NEXT: v_mov_b32_e32 v103, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 20 -; GFX11-NEXT: v_mov_b32_e32 v70, s44 -; GFX11-NEXT: v_mov_b32_e32 v80, s42 -; GFX11-NEXT: v_mov_b32_e32 v26, s76 -; GFX11-NEXT: v_mov_b32_e32 v32, s78 +; GFX11-NEXT: v_readlane_b32 s0, v76, 18 +; GFX11-NEXT: v_dual_mov_b32 v25, s74 
:: v_dual_mov_b32 v54, s94 +; GFX11-NEXT: v_dual_mov_b32 v53, s92 :: v_dual_mov_b32 v64, s30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v112, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 21 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 19 +; GFX11-NEXT: v_dual_mov_b32 v65, s60 :: v_dual_mov_b32 v66, s58 +; GFX11-NEXT: v_dual_mov_b32 v67, s56 :: v_dual_mov_b32 v68, s46 ; GFX11-NEXT: v_mov_b32_e32 v102, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 22 +; GFX11-NEXT: v_readlane_b32 s0, v76, 20 +; GFX11-NEXT: v_dual_mov_b32 v69, s44 :: v_dual_mov_b32 v70, s42 +; GFX11-NEXT: v_mov_b32_e32 v26, s76 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v32, s78 :: v_dual_mov_b32 v103, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 21 ; GFX11-NEXT: v_mov_b32_e32 v101, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 23 +; GFX11-NEXT: v_readlane_b32 s0, v76, 22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v100, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 24 -; GFX11-NEXT: v_mov_b32_e32 v98, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 23 ; GFX11-NEXT: v_mov_b32_e32 v99, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 26 +; GFX11-NEXT: v_readlane_b32 s0, v76, 24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v97, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 27 +; GFX11-NEXT: v_readlane_b32 s0, v76, 25 +; GFX11-NEXT: v_mov_b32_e32 v98, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 26 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v96, s0 -; GFX11-NEXT: v_readlane_b32 
s0, v77, 28 +; GFX11-NEXT: v_readlane_b32 s0, v76, 27 ; GFX11-NEXT: v_mov_b32_e32 v87, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 29 +; GFX11-NEXT: v_readlane_b32 s0, v76, 28 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v85, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 30 ; GFX11-NEXT: v_mov_b32_e32 v86, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 31 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 29 ; GFX11-NEXT: v_mov_b32_e32 v84, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v85, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 31 ; GFX11-NEXT: v_mov_b32_e32 v83, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-NEXT: v_readlane_b32 s0, v77, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v82, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 2 -; GFX11-NEXT: v_mov_b32_e32 v51, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-NEXT: v_mov_b32_e32 v81, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v55, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 4 -; GFX11-NEXT: v_mov_b32_e32 v39, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-NEXT: v_mov_b32_e32 v80, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v39, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 5 ; GFX11-NEXT: v_mov_b32_e32 v33, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 6 -; GFX11-NEXT: v_mov_b32_e32 v27, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 7 +; 
GFX11-NEXT: v_readlane_b32 s0, v77, 6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v27, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v20, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v16, s0 ; GFX11-NEXT: .LBB37_5: ; %end -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v80 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v74 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v62 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v73 ; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v80 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v73 -; GFX11-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 ; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_or_b32_e32 v71, v80, v71 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v72 -; GFX11-NEXT: v_or_b32_e32 v70, v60, v70 +; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v61 +; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-NEXT: v_or_b32_e32 v48, v48, v71 +; GFX11-NEXT: v_and_b32_e32 v71, 0xff, v72 +; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v60 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 -; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v71 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v80 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v63 +; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-NEXT: v_or_b32_e32 v70, v71, v70 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v63 +; GFX11-NEXT: v_or_b32_e32 v50, v50, v60 +; GFX11-NEXT: 
v_and_b32_e32 v57, 0xff, v57 +; GFX11-NEXT: v_lshlrev_b32_e32 v56, 8, v56 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v70 +; GFX11-NEXT: v_or_b32_e32 v49, v49, v71 +; GFX11-NEXT: v_and_b32_e32 v71, 0xff, v62 +; GFX11-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-NEXT: v_or_b32_e32 v48, v48, v70 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v59 +; GFX11-NEXT: v_or_b32_e32 v71, v71, v61 +; GFX11-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-NEXT: v_and_b32_e32 v46, 0xff, v46 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v68 -; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-NEXT: v_or_b32_e32 v60, v52, v71 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-NEXT: v_or_b32_e32 v80, v80, v81 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v61 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v59 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v80 -; GFX11-NEXT: v_or_b32_e32 v48, v48, v81 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v58 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v57 +; GFX11-NEXT: v_or_b32_e32 v69, v70, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v71 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v58 +; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-NEXT: v_or_b32_e32 v68, v46, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 ; GFX11-NEXT: v_or_b32_e32 v49, v49, v71 -; GFX11-NEXT: v_or_b32_e32 v61, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v70 -; GFX11-NEXT: v_or_b32_e32 v53, v80, v81 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v71, 0xff, v47 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v66 -; GFX11-NEXT: v_or_b32_e32 v62, v48, v52 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; GFX11-NEXT: v_or_b32_e32 v36, v36, v70 -; GFX11-NEXT: v_or_b32_e32 v52, v71, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v46 -; GFX11-NEXT: v_and_b32_e32 
v69, 0xff, v45 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v44 -; GFX11-NEXT: v_or_b32_e32 v63, v48, v49 +; GFX11-NEXT: v_or_b32_e32 v51, v51, v70 +; GFX11-NEXT: v_or_b32_e32 v70, v57, v56 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v47 +; GFX11-NEXT: v_or_b32_e32 v50, v50, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v43 +; GFX11-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v70 +; GFX11-NEXT: v_or_b32_e32 v36, v36, v71 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v45 +; GFX11-NEXT: v_and_b32_e32 v71, 0xff, v44 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 +; GFX11-NEXT: v_or_b32_e32 v51, v51, v69 ; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v52 -; GFX11-NEXT: v_or_b32_e32 v37, v37, v53 -; GFX11-NEXT: v_or_b32_e32 v49, v69, v70 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v43 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v42 -; GFX11-NEXT: v_or_b32_e32 v34, v36, v48 +; GFX11-NEXT: v_or_b32_e32 v37, v37, v70 +; GFX11-NEXT: v_or_b32_e32 v69, v71, v43 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v42 +; GFX11-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-NEXT: v_or_b32_e32 v34, v36, v68 ; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v49 -; GFX11-NEXT: v_or_b32_e32 v48, v52, v53 -; GFX11-NEXT: v_or_b32_e32 v49, v69, v68 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v41 -; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v40 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v183 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v69 +; GFX11-NEXT: v_or_b32_e32 v68, v70, v71 +; GFX11-NEXT: v_or_b32_e32 v67, v41, v67 +; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v40 +; GFX11-NEXT: v_and_b32_e32 v71, 0xff, v183 +; GFX11-NEXT: v_lshlrev_b32_e32 v182, 8, v182 ; GFX11-NEXT: v_or_b32_e32 v35, v36, v37 -; 
GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v49 -; GFX11-NEXT: v_or_b32_e32 v48, v52, v53 -; GFX11-NEXT: v_or_b32_e32 v49, v68, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v182 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v181 -; GFX11-NEXT: v_or_b32_e32 v36, v36, v37 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GFX11-NEXT: v_or_b32_e32 v30, v30, v52 -; GFX11-NEXT: v_or_b32_e32 v52, v53, v67 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v177 -; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v176 -; GFX11-NEXT: v_or_b32_e32 v37, v48, v49 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v179 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v178 -; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v180 -; GFX11-NEXT: v_or_b32_e32 v28, v28, v53 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_or_b32_e32 v49, v49, v52 -; GFX11-NEXT: v_or_b32_e32 v52, v67, v66 -; GFX11-NEXT: v_or_b32_e32 v31, v31, v68 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v167 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v166 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v165 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_or_b32_e32 v29, v29, v68 -; GFX11-NEXT: v_or_b32_e32 v53, v69, v70 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GFX11-NEXT: v_or_b32_e32 v28, v30, v48 -; GFX11-NEXT: v_or_b32_e32 v30, v66, v52 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v67 +; GFX11-NEXT: v_or_b32_e32 v67, v69, v70 +; GFX11-NEXT: v_or_b32_e32 v68, v71, v182 ; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v151 -; GFX11-NEXT: v_and_b32_e32 v67, 0xffff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
scratch_store_b128 v0, v[60:63], off -; GFX11-NEXT: scratch_store_b128 v0, v[34:37], off offset:16 -; GFX11-NEXT: v_or_b32_e32 v29, v31, v49 ; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v163 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v65 +; GFX11-NEXT: v_or_b32_e32 v36, v36, v37 +; GFX11-NEXT: v_and_b32_e32 v67, 0xffff, v67 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 ; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v162 -; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v161 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v160 -; GFX11-NEXT: v_or_b32_e32 v21, v21, v52 -; GFX11-NEXT: v_or_b32_e32 v31, v67, v53 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v37, v67, v68 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off +; GFX11-NEXT: scratch_store_b128 v0, v[34:37], off offset:16 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v150 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v162 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v64 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v160 +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v151 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v50 ; GFX11-NEXT: v_or_b32_e32 v23, v23, v34 ; GFX11-NEXT: v_or_b32_e32 v34, v35, v36 ; GFX11-NEXT: v_or_b32_e32 v24, v24, v37 ; GFX11-NEXT: v_or_b32_e32 v35, v48, v49 ; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v21 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v150 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v64 -; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v149 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v146 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v145 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v54 -; GFX11-NEXT: v_and_b32_e32 v49, 
0xff, v147 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v148 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v145 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v144 +; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v146 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v147 ; GFX11-NEXT: v_or_b32_e32 v21, v21, v37 ; GFX11-NEXT: v_or_b32_e32 v22, v22, v48 -; GFX11-NEXT: v_or_b32_e32 v17, v17, v53 -; GFX11-NEXT: v_or_b32_e32 v48, v64, v54 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v51 +; GFX11-NEXT: v_or_b32_e32 v48, v54, v53 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_or_b32_e32 v37, v49, v52 +; GFX11-NEXT: v_or_b32_e32 v37, v49, v50 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v21 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v22 +; GFX11-NEXT: v_and_b32_e32 v50, 0xffff, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; GFX11-NEXT: v_or_b32_e32 v21, v23, v34 ; GFX11-NEXT: v_or_b32_e32 v22, v24, v35 ; GFX11-NEXT: v_or_b32_e32 v23, v36, v49 ; GFX11-NEXT: v_or_b32_e32 v34, v17, v48 ; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v144 -; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v134 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v135 -; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v132 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v50 -; GFX11-NEXT: v_or_b32_e32 v24, v52, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v135 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v52 +; GFX11-NEXT: v_or_b32_e32 v24, v50, v37 ; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v133 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v132 ; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 ; GFX11-NEXT: v_or_b32_e32 v18, v35, v36 ; GFX11-NEXT: v_or_b32_e32 v35, v48, v49 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v131 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v130 ; GFX11-NEXT: v_or_b32_e32 v13, v13, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v130 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v129 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v129 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v128 ; GFX11-NEXT: v_or_b32_e32 v14, v14, v50 ; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v128 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v119 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v119 +; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v118 ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v38 ; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v118 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v116 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v117 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v117 +; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v116 ; GFX11-NEXT: v_or_b32_e32 v35, v35, v37 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -47828,7 +47957,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v11, v11, v48 ; GFX11-NEXT: v_or_b32_e32 v37, v49, v38 ; GFX11-NEXT: v_or_b32_e32 v12, v12, v50 -; GFX11-NEXT: v_or_b32_e32 v38, v52, v53 +; GFX11-NEXT: v_or_b32_e32 v38, v51, v52 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v35 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v37 @@ -47838,32 +47967,32 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v36, v13, v36 ; GFX11-NEXT: v_or_b32_e32 v37, v14, v48 ; GFX11-NEXT: v_and_b32_e32 v9, 0xff, 
v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v115 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v113 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v32 ; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v113 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v112 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v102 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v101 ; GFX11-NEXT: v_or_b32_e32 v11, v11, v49 ; GFX11-NEXT: v_or_b32_e32 v12, v12, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v103 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v102 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v103 ; GFX11-NEXT: v_or_b32_e32 v9, v9, v13 ; GFX11-NEXT: v_or_b32_e32 v13, v14, v17 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v18 ; GFX11-NEXT: v_or_b32_e32 v7, v7, v48 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v101 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v100 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v26 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v97 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v87 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; GFX11-NEXT: v_or_b32_e32 v14, v32, v38 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v100 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v99 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v99 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v98 ; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v48 ; GFX11-NEXT: v_or_b32_e32 v25, v49, v25 @@ -47877,6 +48006,12 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v181 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v180 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v179 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX11-NEXT: v_or_b32_e32 v13, v9, v13 @@ -47884,25 +48019,43 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v7, v7, v17 ; GFX11-NEXT: v_or_b32_e32 v9, v5, v25 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v87 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v85 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v86 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v82 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v69 +; GFX11-NEXT: v_or_b32_e32 v66, v70, v66 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v71 +; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v178 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v177 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v167 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v65 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v166 +; GFX11-NEXT: v_and_b32_e32 v165, 0xff, v165 +; GFX11-NEXT: v_lshlrev_b32_e32 v164, 8, v164 ; GFX11-NEXT: v_or_b32_e32 v8, v8, v18 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v83 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v82 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v81 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-NEXT: v_or_b32_e32 
v6, v10, v17 ; GFX11-NEXT: v_or_b32_e32 v10, v25, v19 +; GFX11-NEXT: v_or_b32_e32 v67, v67, v68 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v69 +; GFX11-NEXT: v_or_b32_e32 v65, v70, v65 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v71 +; GFX11-NEXT: v_or_b32_e32 v68, v165, v164 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v18 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v55 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v51 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v39 ; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v33 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v15 @@ -47910,6 +48063,14 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v27 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_and_b32_e32 v69, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v65 +; GFX11-NEXT: v_and_b32_e32 v70, 0xffff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v18 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 ; GFX11-NEXT: v_or_b32_e32 v15, v25, v15 @@ -47917,6 +48078,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v16, v20, v16 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_or_b32_e32 v28, v30, v66 +; GFX11-NEXT: v_or_b32_e32 v29, v31, v67 +; GFX11-NEXT: v_or_b32_e32 v30, v69, v65 +; GFX11-NEXT: v_or_b32_e32 v31, v70, v68 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 @@ -47936,73 +48101,72 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:72 -; GFX11-NEXT: v_readlane_b32 s104, v76, 8 -; GFX11-NEXT: v_readlane_b32 s103, v76, 7 -; GFX11-NEXT: v_readlane_b32 s102, v76, 6 -; GFX11-NEXT: v_readlane_b32 s101, v76, 5 -; GFX11-NEXT: v_readlane_b32 s100, v76, 4 -; GFX11-NEXT: v_readlane_b32 s99, v76, 3 -; GFX11-NEXT: v_readlane_b32 s98, v76, 2 -; GFX11-NEXT: v_readlane_b32 s97, v76, 1 -; GFX11-NEXT: v_readlane_b32 s96, v76, 0 -; GFX11-NEXT: v_readlane_b32 s87, v75, 31 -; GFX11-NEXT: v_readlane_b32 s86, v75, 30 -; GFX11-NEXT: v_readlane_b32 s85, v75, 29 -; GFX11-NEXT: v_readlane_b32 s84, v75, 28 -; GFX11-NEXT: v_readlane_b32 
s83, v75, 27 -; GFX11-NEXT: v_readlane_b32 s82, v75, 26 -; GFX11-NEXT: v_readlane_b32 s81, v75, 25 -; GFX11-NEXT: v_readlane_b32 s80, v75, 24 -; GFX11-NEXT: v_readlane_b32 s71, v75, 23 -; GFX11-NEXT: v_readlane_b32 s70, v75, 22 -; GFX11-NEXT: v_readlane_b32 s69, v75, 21 -; GFX11-NEXT: v_readlane_b32 s68, v75, 20 -; GFX11-NEXT: v_readlane_b32 s67, v75, 19 -; GFX11-NEXT: v_readlane_b32 s66, v75, 18 -; GFX11-NEXT: v_readlane_b32 s65, v75, 17 -; GFX11-NEXT: v_readlane_b32 s64, v75, 16 -; GFX11-NEXT: v_readlane_b32 s55, v75, 15 -; GFX11-NEXT: v_readlane_b32 s54, v75, 14 -; GFX11-NEXT: v_readlane_b32 s53, v75, 13 -; GFX11-NEXT: v_readlane_b32 s52, v75, 12 -; GFX11-NEXT: v_readlane_b32 s51, v75, 11 -; GFX11-NEXT: v_readlane_b32 s50, v75, 10 -; GFX11-NEXT: v_readlane_b32 s49, v75, 9 -; GFX11-NEXT: v_readlane_b32 s48, v75, 8 -; GFX11-NEXT: v_readlane_b32 s39, v75, 7 -; GFX11-NEXT: v_readlane_b32 s38, v75, 6 -; GFX11-NEXT: v_readlane_b32 s37, v75, 5 -; GFX11-NEXT: v_readlane_b32 s36, v75, 4 -; GFX11-NEXT: v_readlane_b32 s35, v75, 3 -; GFX11-NEXT: v_readlane_b32 s34, v75, 2 -; GFX11-NEXT: v_readlane_b32 s31, v75, 1 -; GFX11-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 
v43, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:68 +; GFX11-NEXT: v_readlane_b32 s104, v75, 8 +; GFX11-NEXT: v_readlane_b32 s103, v75, 7 +; GFX11-NEXT: v_readlane_b32 s102, v75, 6 +; GFX11-NEXT: v_readlane_b32 s101, v75, 5 +; GFX11-NEXT: v_readlane_b32 s100, v75, 4 +; GFX11-NEXT: v_readlane_b32 s99, v75, 3 +; GFX11-NEXT: v_readlane_b32 s98, v75, 2 +; GFX11-NEXT: v_readlane_b32 s97, v75, 1 +; GFX11-NEXT: v_readlane_b32 s96, v75, 0 +; GFX11-NEXT: v_readlane_b32 s87, v74, 31 +; GFX11-NEXT: v_readlane_b32 s86, v74, 30 +; GFX11-NEXT: v_readlane_b32 s85, v74, 29 +; GFX11-NEXT: v_readlane_b32 s84, v74, 28 +; GFX11-NEXT: v_readlane_b32 s83, v74, 27 +; GFX11-NEXT: v_readlane_b32 s82, v74, 26 +; GFX11-NEXT: v_readlane_b32 s81, v74, 25 +; GFX11-NEXT: v_readlane_b32 s80, v74, 24 +; GFX11-NEXT: v_readlane_b32 s71, v74, 23 +; GFX11-NEXT: v_readlane_b32 s70, v74, 22 +; GFX11-NEXT: v_readlane_b32 s69, v74, 21 +; GFX11-NEXT: v_readlane_b32 s68, v74, 20 +; GFX11-NEXT: v_readlane_b32 s67, v74, 19 +; GFX11-NEXT: v_readlane_b32 s66, v74, 18 +; GFX11-NEXT: v_readlane_b32 s65, v74, 17 +; GFX11-NEXT: v_readlane_b32 s64, v74, 16 +; GFX11-NEXT: v_readlane_b32 s55, v74, 15 +; GFX11-NEXT: v_readlane_b32 s54, v74, 14 +; GFX11-NEXT: v_readlane_b32 s53, v74, 13 +; GFX11-NEXT: v_readlane_b32 s52, v74, 12 +; GFX11-NEXT: v_readlane_b32 s51, v74, 11 +; GFX11-NEXT: v_readlane_b32 s50, v74, 10 +; GFX11-NEXT: v_readlane_b32 s49, v74, 9 +; GFX11-NEXT: v_readlane_b32 s48, v74, 8 +; GFX11-NEXT: v_readlane_b32 s39, v74, 7 +; GFX11-NEXT: v_readlane_b32 s38, v74, 6 +; GFX11-NEXT: v_readlane_b32 s37, v74, 5 +; GFX11-NEXT: v_readlane_b32 s36, v74, 4 +; GFX11-NEXT: v_readlane_b32 s35, v74, 3 +; GFX11-NEXT: v_readlane_b32 s34, v74, 2 +; GFX11-NEXT: v_readlane_b32 s31, v74, 1 +; GFX11-NEXT: v_readlane_b32 s30, v74, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: 
s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -57262,10 +57426,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:240 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v95, off, s32 offset:248 @@ -57298,7 +57462,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:132 ; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v147, off, s32 offset:124 @@ -57359,25 +57523,25 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v115 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 8, v116 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v132 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) @@ -57418,253 +57582,262 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; 
GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v135 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v0, v0, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v145 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v129 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff @@ -57692,18 +57865,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_3 @@ -57882,7 +58045,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v119 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v58, v1 @@ -57956,9 +58119,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v145, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v131, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v132, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -57992,13 +58155,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v133, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v135, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v144, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v128, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v129, v23 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v130, v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -58203,10 +58366,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:232 ; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 @@ -58239,7 +58402,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 ; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:140 ; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 ; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 @@ -58300,25 +58463,25 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 ; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v99 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v132 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) @@ -58359,253 +58522,262 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; 
GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v1, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v119 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v134 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v135 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v144 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v129 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 
v26, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff @@ -58633,18 +58805,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 ; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_3 @@ -58823,7 +58985,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v119 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v57, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 @@ -58897,9 +59059,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v145, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v131, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v132, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -58933,13 +59095,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v133, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v135, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v144, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v128, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v129, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v130, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -60675,8 +60837,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; VI-NEXT: 
v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -60754,8 +60916,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -61608,9 +61770,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61626,9 +61788,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61644,9 +61806,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61662,9 +61824,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61680,9 +61842,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61698,9 +61860,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: 
v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61716,9 +61878,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61734,9 +61896,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61752,9 +61914,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61770,9 +61932,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> 
%a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61788,9 +61950,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61806,9 +61968,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61824,9 +61986,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 
v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61842,9 +62004,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61860,9 +62022,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61878,10 +62040,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61897,9 +62059,9 
@@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61915,9 +62077,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61933,9 +62095,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61951,9 +62113,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; VI-NEXT: 
v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61969,9 +62131,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -61987,9 +62149,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62005,9 +62167,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 @@ -62023,9 +62185,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62041,9 +62203,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62059,9 +62221,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62077,9 +62239,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; 
VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62095,9 +62257,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62113,9 +62275,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62131,9 +62293,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; 
VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62149,9 +62311,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -62201,9 +62363,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62216,9 +62378,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62231,9 +62393,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62246,9 +62408,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62261,9 +62423,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62276,9 +62438,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 
v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62291,9 +62453,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62306,9 +62468,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62321,9 +62483,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ 
-62336,9 +62498,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62351,9 +62513,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62366,9 +62528,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62381,9 +62543,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; 
GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62396,9 +62558,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62411,9 +62573,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62426,10 +62588,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: v_perm_b32 v0, 
v0, v32, s7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62442,9 +62604,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62457,9 +62619,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62472,9 +62634,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62487,9 +62649,9 
@@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62502,9 +62664,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62517,9 +62679,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62532,9 +62694,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, 
vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62547,9 +62709,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62562,9 +62724,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62577,9 +62739,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: 
v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62592,9 +62754,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62607,9 +62769,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62622,9 +62784,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62637,9 +62799,9 @@ define <32 x float> 
@bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -62652,9 +62814,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -64421,8 +64583,8 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -64455,9 +64617,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: 
v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64473,9 +64635,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64491,9 +64653,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64509,9 +64671,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64527,9 +64689,9 @@ define inreg <32 x float> 
@bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64545,9 +64707,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64563,9 +64725,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64581,9 +64743,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_alignbit_b32 
v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64599,9 +64761,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64617,9 +64779,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64635,9 +64797,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, 
vcc, 0x7fff, v33 @@ -64653,9 +64815,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64671,9 +64833,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64689,9 +64851,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64707,9 +64869,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: 
v_lshlrev_b32_e32 v33, 16, v0 ; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64725,9 +64887,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 ; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64743,9 +64905,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64761,9 +64923,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64779,9 +64941,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64797,9 +64959,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64815,9 +64977,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64833,9 +64995,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; 
VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64851,9 +65013,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64869,9 +65031,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64887,9 +65049,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: 
v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64905,9 +65067,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64923,9 +65085,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64941,9 +65103,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64959,9 +65121,9 @@ 
define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 ; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64977,9 +65139,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -64995,9 +65157,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -65044,8 +65206,8 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; 
GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -65681,864 +65843,1020 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: 
scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, 
s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: 
scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 
+; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: 
v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-NEXT: .LBB43_2: ; %cmp.true -; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s4, s27, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s5, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 -; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 
0x40c00000, s4 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 -; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s4, s26, 16 +; GFX11-NEXT: s_and_b32 s5, s24, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_lshl_b32 s4, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v30, v0, 16, v1 +; GFX11-NEXT: v_bfe_u32 v0, v4, 16, 
1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, 
v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v3 :: v_dual_add_nc_u32 v3, v6, v1 +; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v3, v11 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v13, v12 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v9, v8 ; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: 
v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v15, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v12, v10 ; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v15 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v10, v16, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 ; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-NEXT: 
v_add_nc_u32_e32 v9, v10, v16 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v9, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v12 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_bfe_u32 v12, v15, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v124, v3, 16, v5 +; GFX11-NEXT: v_lshl_or_b32 v112, v6, 16, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v15 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 
0xffff, v10 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v12, v11, v12, vcc_lo +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshl_or_b32 v101, v8, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v137, v2, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX11-NEXT: v_lshl_or_b32 v91, v9, 16, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc_lo +; GFX11-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v82, v11, 16, v13 +; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v16 ; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v17 ; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-NEXT: v_lshl_or_b32 v74, v15, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v19 +; GFX11-NEXT: v_lshl_or_b32 v67, v17, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v21, v21, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v22, v22, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-NEXT: v_lshl_or_b32 v61, v19, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s3 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v23 +; GFX11-NEXT: v_lshl_or_b32 v56, v21, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: 
v_cndmask_b32_e32 v23, v24, v25, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s2 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v25, v25, v24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x7fff, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v26, v26, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 
v25, v26, v27, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s1 ; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_lshl_or_b32 v52, v23, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v27, v27, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, 
v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v28, v28, v27 +; GFX11-NEXT: v_lshl_or_b32 v49, v25, 16, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v29, v29, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v29 +; GFX11-NEXT: v_dual_cndmask_b32 v28, v29, v31 :: v_dual_lshlrev_b32 v29, 16, v176 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v176 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 
0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v47, v27, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v176, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v177 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v177 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_f32_e32 v31, 
0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v177, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v178 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, 
v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v179 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v179 +; GFX11-NEXT: v_add_f32_e32 
v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v179, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v180 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v180 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v181 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v181 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-NEXT: v_lshl_or_b32 v181, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v182 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v182 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v183 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, 
v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v183 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-NEXT: v_lshl_or_b32 v183, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v170 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: 
v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v170 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: 
v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, 
v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 -; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 
v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_lshl_or_b32 v170, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v171 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v171 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v171, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v172 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v172 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v172, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v173 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v173 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v173, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v174 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v174 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v174, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v175 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v175 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, 
vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v175, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v185 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v185 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v185, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v184 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: 
v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v184 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v184, v31, 16, v29 ; GFX11-NEXT: .LBB43_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: 
v_dual_mov_b32 v29, v180 +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, 
off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 
offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; 
GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 
offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB43_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 ; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 ; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 ; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: 
$vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: 
$vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 ; GFX11-NEXT: s_branch .LBB43_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -68146,8 +68464,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -68225,8 +68543,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -70036,8 +70354,8 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -70180,8 +70498,8 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; 
GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -70241,107 +70559,109 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; 
GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:296 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:176 +; GFX11-NEXT: 
scratch_store_b32 off, v95, s32 offset:172 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: 
scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 
off, v152, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:44 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 ; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 ; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 ; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v171, v5 :: v_dual_mov_b32 v172, v4 +; GFX11-NEXT: v_dual_mov_b32 v173, v3 :: v_dual_mov_b32 v174, v2 +; GFX11-NEXT: v_dual_mov_b32 v175, v1 :: v_dual_mov_b32 v184, v0 +; GFX11-NEXT: v_dual_mov_b32 v185, s28 :: v_dual_mov_b32 v186, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB47_4 ; 
GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v61, s0 :: v_dual_mov_b32 v66, s2 +; GFX11-NEXT: v_dual_mov_b32 v63, s1 :: v_dual_mov_b32 v70, s3 +; GFX11-NEXT: v_dual_mov_b32 v75, s16 :: v_dual_mov_b32 v88, s18 +; GFX11-NEXT: v_dual_mov_b32 v81, s17 :: v_dual_mov_b32 v96, s19 +; GFX11-NEXT: v_dual_mov_b32 v105, s20 :: v_dual_mov_b32 v126, s22 +; GFX11-NEXT: v_dual_mov_b32 v115, s21 :: v_dual_mov_b32 v138, s23 +; GFX11-NEXT: v_dual_mov_b32 v151, s24 :: v_dual_mov_b32 v28, s26 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v44, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-NEXT: .LBB47_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v44, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] @@ -70350,142 +70670,142 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; 
GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v138, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v126, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v115, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v105, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v96, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v88, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v81, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v75, 0x200, s16 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v70, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v66, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v63, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB47_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v0, v61 :: v_dual_mov_b32 v1, v63 +; GFX11-NEXT: v_dual_mov_b32 v3, v70 :: v_dual_mov_b32 v4, v75 +; GFX11-NEXT: v_dual_mov_b32 v6, v88 :: v_dual_mov_b32 v9, v115 +; GFX11-NEXT: v_dual_mov_b32 v7, v96 :: v_dual_mov_b32 v8, v105 +; GFX11-NEXT: v_dual_mov_b32 v10, v126 :: v_dual_mov_b32 v15, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v138 :: v_dual_mov_b32 v12, v151 +; GFX11-NEXT: v_dual_mov_b32 v14, v28 :: v_dual_mov_b32 v17, v186 +; GFX11-NEXT: v_dual_mov_b32 v16, v185 :: v_dual_mov_b32 v19, v175 +; GFX11-NEXT: v_dual_mov_b32 v18, v184 :: v_dual_mov_b32 v21, v173 +; GFX11-NEXT: v_dual_mov_b32 v20, v174 :: v_dual_mov_b32 v23, v171 +; GFX11-NEXT: v_dual_mov_b32 v22, v172 :: v_dual_mov_b32 v25, v182 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 
offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v186, off, s32 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:28 
+; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: 
scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 
v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:248 +; 
GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:252 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:292 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:296 +; GFX11-NEXT: v_dual_mov_b32 v2, v66 :: v_dual_mov_b32 v5, v81 +; GFX11-NEXT: v_dual_mov_b32 v24, v183 :: v_dual_mov_b32 v27, v180 +; GFX11-NEXT: v_dual_mov_b32 v26, v181 :: v_dual_mov_b32 v29, v178 +; GFX11-NEXT: v_dual_mov_b32 v28, v179 :: v_dual_mov_b32 v31, v176 +; GFX11-NEXT: v_mov_b32_e32 v30, v177 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB47_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: 
$vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92 ; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: 
$vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107 +; GFX11-NEXT: ; implicit-def: 
$vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137 +; GFX11-NEXT: ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158 +; GFX11-NEXT: ; implicit-def: $vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170 ; GFX11-NEXT: s_branch .LBB47_2 %cmp = icmp eq 
i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -71053,12 +71373,12 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_mov_b32_e32 v36, s16 +; SI-NEXT: v_mov_b32_e32 v33, s16 +; SI-NEXT: v_mov_b32_e32 v30, s17 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v33, s18 -; SI-NEXT: v_mov_b32_e32 v32, s19 -; SI-NEXT: v_mov_b32_e32 v31, s20 +; SI-NEXT: v_mov_b32_e32 v35, s18 +; SI-NEXT: v_mov_b32_e32 v34, s19 +; SI-NEXT: v_mov_b32_e32 v32, s20 ; SI-NEXT: v_mov_b32_e32 v29, s21 ; SI-NEXT: v_mov_b32_e32 v28, s22 ; SI-NEXT: v_mov_b32_e32 v26, s23 @@ -71089,8 +71409,8 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v23, v18, v17, 16 ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v36, v12, v11, 16 ; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 @@ -71100,11 +71420,11 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v41, v21, v22, 16 ; SI-NEXT: v_alignbit_b32 v43, v24, v25, 16 ; SI-NEXT: v_alignbit_b32 v45, v26, v28, 16 -; SI-NEXT: v_alignbit_b32 v47, v29, v31, 16 +; SI-NEXT: v_alignbit_b32 v47, v29, v32, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v32, v33, 16 +; SI-NEXT: v_alignbit_b32 v58, v34, v35, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v35, v36, 16 +; SI-NEXT: v_alignbit_b32 v60, v30, v33, 16 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 ; 
SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 @@ -71122,19 +71442,19 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v34 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 ; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -71162,8 +71482,8 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v23, v18, v17, 16 ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v36, v12, v11, 16 ; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 @@ -71173,11 +71493,11 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v41, v21, v22, 16 ; SI-NEXT: v_alignbit_b32 v43, v24, v25, 16 ; SI-NEXT: v_alignbit_b32 v45, v26, v28, 16 -; SI-NEXT: v_alignbit_b32 v47, v29, v31, 16 +; SI-NEXT: 
v_alignbit_b32 v47, v29, v32, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v32, v33, 16 +; SI-NEXT: v_alignbit_b32 v58, v34, v35, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v35, v36, 16 +; SI-NEXT: v_alignbit_b32 v60, v30, v33, 16 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 @@ -71195,38 +71515,38 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v34 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v33 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v36, v36, v60 -; SI-NEXT: v_or_b32_e32 v23, v35, v23 -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v23, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v58 -; SI-NEXT: v_or_b32_e32 v23, v23, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v33, v33, v60 +; SI-NEXT: v_or_b32_e32 v23, v30, v23 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v23, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v58 +; SI-NEXT: v_or_b32_e32 v23, 
v23, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v63 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v30, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v63 -; SI-NEXT: v_or_b32_e32 v23, v23, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v47 -; SI-NEXT: v_or_b32_e32 v23, v23, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v23, v31, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v30, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v62 @@ -71343,7 +71663,7 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -71355,7 +71675,7 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, 
vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -71435,9 +71755,9 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -71469,8 +71789,8 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -71548,8 +71868,8 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -73182,8 +73502,8 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -73242,107 +73562,109 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:296 
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:172 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 
offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 
off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 
offset:56 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:44 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 ; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 ; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 ; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v171, v5 :: v_dual_mov_b32 v172, v4 +; GFX11-NEXT: v_dual_mov_b32 v173, v3 :: v_dual_mov_b32 v174, v2 +; GFX11-NEXT: v_dual_mov_b32 v175, v1 :: v_dual_mov_b32 v184, v0 +; GFX11-NEXT: v_dual_mov_b32 v185, s28 :: v_dual_mov_b32 v186, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: 
v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v61, s0 :: v_dual_mov_b32 v66, s2 +; GFX11-NEXT: v_dual_mov_b32 v63, s1 :: v_dual_mov_b32 v70, s3 +; GFX11-NEXT: v_dual_mov_b32 v75, s16 :: v_dual_mov_b32 v88, s18 +; GFX11-NEXT: v_dual_mov_b32 v81, s17 :: v_dual_mov_b32 v96, s19 +; GFX11-NEXT: v_dual_mov_b32 v105, s20 :: v_dual_mov_b32 v126, s22 +; GFX11-NEXT: v_dual_mov_b32 v115, s21 :: v_dual_mov_b32 v138, s23 +; GFX11-NEXT: v_dual_mov_b32 v151, s24 :: v_dual_mov_b32 v28, s26 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v44, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-NEXT: .LBB51_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v44, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] @@ -73351,142 +73673,142 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] ; 
GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v138, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v126, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v115, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v105, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v96, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v88, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v81, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v75, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v70, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v66, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v63, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB51_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 
v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v0, v61 :: v_dual_mov_b32 v1, v63 +; GFX11-NEXT: v_dual_mov_b32 v3, v70 :: v_dual_mov_b32 v4, v75 +; GFX11-NEXT: v_dual_mov_b32 v6, v88 :: v_dual_mov_b32 v9, v115 +; GFX11-NEXT: v_dual_mov_b32 v7, v96 :: v_dual_mov_b32 v8, v105 +; GFX11-NEXT: v_dual_mov_b32 v10, v126 :: v_dual_mov_b32 v15, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v138 :: v_dual_mov_b32 v12, v151 +; GFX11-NEXT: v_dual_mov_b32 v14, v28 :: v_dual_mov_b32 v17, v186 +; GFX11-NEXT: v_dual_mov_b32 v16, v185 :: v_dual_mov_b32 v19, v175 +; GFX11-NEXT: v_dual_mov_b32 v18, v184 :: v_dual_mov_b32 v21, v173 +; GFX11-NEXT: v_dual_mov_b32 v20, v174 :: v_dual_mov_b32 v23, v171 +; GFX11-NEXT: v_dual_mov_b32 v22, v172 :: v_dual_mov_b32 v25, v182 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: 
scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v186, off, s32 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:56 +; GFX11-NEXT: 
scratch_load_b32 v155, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 
v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:136 +; GFX11-NEXT: 
scratch_load_b32 v111, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:252 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v45, 
off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:292 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:296 +; GFX11-NEXT: v_dual_mov_b32 v2, v66 :: v_dual_mov_b32 v5, v81 +; GFX11-NEXT: v_dual_mov_b32 v24, v183 :: v_dual_mov_b32 v27, v180 +; GFX11-NEXT: v_dual_mov_b32 v26, v181 :: v_dual_mov_b32 v29, v178 +; GFX11-NEXT: v_dual_mov_b32 v28, v179 :: v_dual_mov_b32 v31, v176 +; GFX11-NEXT: v_mov_b32_e32 v30, v177 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB51_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; 
implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92 ; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: 
$vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45 +; GFX11-NEXT: ; implicit-def: 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120 +; GFX11-NEXT: ; implicit-def: 
$vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137 +; GFX11-NEXT: ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158 +; GFX11-NEXT: ; implicit-def: $vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170 ; GFX11-NEXT: s_branch .LBB51_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -73757,8 +74079,8 @@ define inreg <16 x double> @bitcast_v16i64_to_v16f64_scalar(<16 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -73836,8 +74158,8 @@ define inreg <16 x double> 
@bitcast_v16i64_to_v16f64_scalar(<16 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -73915,8 +74237,8 @@ define inreg <16 x double> @bitcast_v16i64_to_v16f64_scalar(<16 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -74221,26 +74543,26 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_mov_b32_e32 v31, v17 -; SI-NEXT: v_mov_b32_e32 v30, v16 ; SI-NEXT: v_mov_b32_e32 v29, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v30, v16 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v26, v12 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v33, v5 -; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: 
v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 @@ -74285,26 +74607,26 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 -; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -74349,26 +74671,26 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_mov_b32_e32 v31, v17 -; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 
v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -78202,32 +78524,30 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 @@ -78322,26 +78642,28 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: 
v_lshrrev_b64 v[53:54], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 @@ -78423,9 +78745,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-FAKE16-NEXT: .LBB56_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -78568,48 +78889,48 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB56_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v67, v65 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v39, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v65, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v56 @@ -78871,27 +79192,26 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v74, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -78966,20 +79286,23 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s71, 23 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: v_mov_b32_e32 v19, s43 +; SI-NEXT: v_alignbit_b32 v20, s42, v19, 24 +; SI-NEXT: v_alignbit_b32 v22, s42, v19, 16 +; SI-NEXT: v_alignbit_b32 v24, s42, v19, 8 +; SI-NEXT: v_mov_b32_e32 v19, s45 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_mov_b32_e32 v12, s13 -; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 
v14, s15 ; SI-NEXT: v_mov_b32_e32 v18, s41 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 -; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 -; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 -; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_alignbit_b32 v28, s44, v19, 24 +; SI-NEXT: v_alignbit_b32 v29, s44, v19, 16 +; SI-NEXT: v_alignbit_b32 v30, s44, v19, 8 +; SI-NEXT: v_mov_b32_e32 v19, s47 ; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v27, s26 ; SI-NEXT: v_mov_b32_e32 v35, s24 ; SI-NEXT: v_mov_b32_e32 v39, s22 ; SI-NEXT: v_mov_b32_e32 v50, s20 @@ -78992,32 +79315,29 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 ; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 ; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v9, s10, v7, 24 +; SI-NEXT: v_alignbit_b32 v10, s10, v7, 16 +; SI-NEXT: v_alignbit_b32 v11, s10, v7, 8 +; SI-NEXT: v_alignbit_b32 v15, s12, v8, 24 +; SI-NEXT: v_alignbit_b32 v7, s12, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, s12, v8, 8 +; SI-NEXT: v_alignbit_b32 v12, s14, v14, 24 +; SI-NEXT: v_alignbit_b32 v13, s14, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s14, v14, 8 ; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 ; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 ; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 -; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 -; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 -; 
SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 -; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 -; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v33, s46, v19, 24 +; SI-NEXT: v_alignbit_b32 v34, s46, v19, 16 +; SI-NEXT: v_alignbit_b32 v36, s46, v19, 8 +; SI-NEXT: v_alignbit_b32 v19, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v21, s29, v23, 16 ; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 -; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 -; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 -; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 -; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v25, s27, v27, 24 +; SI-NEXT: v_alignbit_b32 v26, s27, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, s27, v27, 8 +; SI-NEXT: v_alignbit_b32 v31, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v32, s25, v35, 16 ; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 ; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 ; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 @@ -79111,22 +79431,25 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s10, s10, 0 ; SI-NEXT: s_add_u32 s9, s9, 3 ; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: v_mov_b32_e32 v19, s43 ; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: v_alignbit_b32 v20, s42, v19, 24 +; SI-NEXT: v_alignbit_b32 v22, s42, v19, 16 +; SI-NEXT: v_alignbit_b32 v24, s42, v19, 8 +; SI-NEXT: v_mov_b32_e32 v19, s45 ; SI-NEXT: s_addc_u32 s6, s6, 0 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_mov_b32_e32 v12, s13 -; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v14, s15 ; SI-NEXT: v_mov_b32_e32 v18, s41 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 -; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 -; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 -; SI-NEXT: v_mov_b32_e32 v22, s47 +; 
SI-NEXT: v_alignbit_b32 v28, s44, v19, 24 +; SI-NEXT: v_alignbit_b32 v29, s44, v19, 16 +; SI-NEXT: v_alignbit_b32 v30, s44, v19, 8 +; SI-NEXT: v_mov_b32_e32 v19, s47 ; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v27, s26 ; SI-NEXT: v_mov_b32_e32 v35, s24 ; SI-NEXT: v_mov_b32_e32 v39, s22 ; SI-NEXT: v_mov_b32_e32 v50, s20 @@ -79139,32 +79462,29 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 ; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 ; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v9, s10, v7, 24 +; SI-NEXT: v_alignbit_b32 v10, s10, v7, 16 +; SI-NEXT: v_alignbit_b32 v11, s10, v7, 8 +; SI-NEXT: v_alignbit_b32 v15, s12, v8, 24 +; SI-NEXT: v_alignbit_b32 v7, s12, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, s12, v8, 8 +; SI-NEXT: v_alignbit_b32 v12, s14, v14, 24 +; SI-NEXT: v_alignbit_b32 v13, s14, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s14, v14, 8 ; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 ; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 ; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 -; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 -; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 -; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 -; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 -; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v33, s46, v19, 24 +; SI-NEXT: v_alignbit_b32 v34, s46, v19, 16 +; SI-NEXT: v_alignbit_b32 
v36, s46, v19, 8 +; SI-NEXT: v_alignbit_b32 v19, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v21, s29, v23, 16 ; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 -; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 -; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 -; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 -; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v25, s27, v27, 24 +; SI-NEXT: v_alignbit_b32 v26, s27, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, s27, v27, 8 +; SI-NEXT: v_alignbit_b32 v31, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v32, s25, v35, 16 ; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 ; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 ; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 @@ -79325,145 +79645,146 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v35, s4, v35 ; SI-NEXT: s_and_b32 s4, s25, 0xff ; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s49, 24 ; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v33, v35, v33 -; SI-NEXT: v_add_i32_e32 v34, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v35, v31 +; SI-NEXT: v_add_i32_e32 v32, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; 
SI-NEXT: v_mov_b32_e32 v32, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; SI-NEXT: v_or_b32_e32 v29, s4, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_or_b32_e32 v27, s4, v27 ; SI-NEXT: s_and_b32 s4, s27, 0xff ; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s39, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s38, 24 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 40, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 40, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v26, s4 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; SI-NEXT: v_or_b32_e32 v23, s4, v23 ; SI-NEXT: s_and_b32 s4, s29, 0xff ; SI-NEXT: s_lshl_b32 s5, s37, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s36, 0xff -; SI-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen -; SI-NEXT: 
v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s35, 24 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v22, v27, v22 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v19, v23, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v21, s4 +; SI-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v32 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; SI-NEXT: v_or_b32_e32 v19, s4, v19 ; SI-NEXT: s_and_b32 s4, s46, 0xff ; SI-NEXT: s_lshl_b32 s5, s34, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s31, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v30 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v33 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s30, 24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v23, v27, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v21, v23, 
v21 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v21, s4 +; SI-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v26 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v30 +; SI-NEXT: v_or_b32_e32 v19, s4, v19 ; SI-NEXT: s_and_b32 s4, s44, 0xff ; SI-NEXT: s_lshl_b32 s5, s95, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v29 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s94, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v28 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s93, 24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, 
vcc, 0x44, v0 +; SI-NEXT: v_mov_b32_e32 v21, s4 +; SI-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v24 +; SI-NEXT: v_or_b32_e32 v19, s4, v19 ; SI-NEXT: s_and_b32 s4, s42, 0xff ; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s91, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s90, 24 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v20, s4 ; SI-NEXT: s_and_b32 s4, s41, 0xff @@ -79491,75 +79812,76 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v17, s4 ; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 ; SI-NEXT: s_and_b32 s4, s14, 0xff ; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_or_b32 s4, s4, 
s5 ; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s14, s76, 24 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s14, s5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x58, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 ; SI-NEXT: s_and_b32 s4, s12, 0xff ; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x5c, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s74, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v15 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s12, s73, 24 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; 
SI-NEXT: s_or_b32 s5, s12, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x60, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v11 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 ; SI-NEXT: s_and_b32 s4, s10, 0xff ; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s63, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s10, s62, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s10, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s9, 0xff @@ -79647,41 +79969,59 @@ define inreg <128 x i8> 
@bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; 
implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr35 @@ -79706,34 +80046,16 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; 
implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -80313,17 +80635,17 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 ; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 50 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 50 ; VI-NEXT: v_mov_b32_e32 v15, s16 ; VI-NEXT: s_and_b32 s16, s45, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_lshl_b32 s17, s17, 8 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: v_readlane_b32 s17, v21, 49 ; VI-NEXT: v_readlane_b32 s18, v21, 48 ; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen @@ -80858,108 +81180,107 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: v_writelane_b32 v21, s46, 0 ; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 1 ; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 ; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 ; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 ; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 ; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 
v21, s46, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 ; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 ; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 ; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 ; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 ; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 ; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 ; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 ; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 ; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 ; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 ; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 ; GFX9-NEXT: s_lshr_b32 s46, s10, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 ; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 ; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 ; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 ; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 
22 ; GFX9-NEXT: s_lshr_b32 s46, s12, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 ; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 ; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 ; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 ; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 ; GFX9-NEXT: s_lshr_b32 s46, s14, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 ; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 ; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 ; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 ; GFX9-NEXT: s_lshr_b32 s46, s40, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 ; GFX9-NEXT: s_lshr_b32 s46, s40, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 ; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 ; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 ; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 ; GFX9-NEXT: s_lshr_b32 s46, s42, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 ; GFX9-NEXT: s_lshr_b32 s46, s42, 
8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 ; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 ; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 ; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 ; GFX9-NEXT: s_lshr_b32 s46, s44, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 ; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 ; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 ; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 ; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 ; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 50 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v21, s56, 0 -; GFX9-NEXT: s_lshr_b32 s82, s28, 8 -; GFX9-NEXT: s_lshr_b32 s83, s27, 24 -; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s82, s27, 24 +; GFX9-NEXT: s_lshr_b32 s83, s27, 16 ; GFX9-NEXT: s_lshr_b32 s84, s27, 8 ; GFX9-NEXT: s_lshr_b32 s85, s26, 16 ; GFX9-NEXT: s_lshr_b32 s86, s26, 8 @@ -80987,8 +81308,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s70, s17, 16 ; GFX9-NEXT: s_lshr_b32 s71, s17, 8 ; GFX9-NEXT: s_lshr_b32 s80, s16, 16 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b32 s81, s16, 8 +; 
GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 ; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 @@ -81039,108 +81360,107 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_add_u32 s4, s4, 3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 ; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: v_writelane_b32 v21, s46, 0 ; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 1 ; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 ; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 ; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 ; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 ; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 ; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 ; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 ; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 ; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 ; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 ; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 ; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 
+; GFX9-NEXT: v_writelane_b32 v21, s46, 13 ; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 ; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 ; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 ; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 ; GFX9-NEXT: s_lshr_b32 s46, s10, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 ; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 ; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 ; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 ; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 ; GFX9-NEXT: s_lshr_b32 s46, s12, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 ; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 ; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 ; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 ; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 ; GFX9-NEXT: s_lshr_b32 s46, s14, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 ; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: v_writelane_b32 v21, s46, 
29 ; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 ; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 ; GFX9-NEXT: s_lshr_b32 s46, s40, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 ; GFX9-NEXT: s_lshr_b32 s46, s40, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 ; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 ; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 ; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 ; GFX9-NEXT: s_lshr_b32 s46, s42, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 ; GFX9-NEXT: s_lshr_b32 s46, s42, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 ; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 ; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 ; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 ; GFX9-NEXT: s_lshr_b32 s46, s44, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 ; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 ; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 ; GFX9-NEXT: s_lshr_b32 s46, s29, 
16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 ; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 ; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 50 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v21, s56, 0 -; GFX9-NEXT: s_lshr_b32 s82, s28, 8 -; GFX9-NEXT: s_lshr_b32 s83, s27, 24 -; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s82, s27, 24 +; GFX9-NEXT: s_lshr_b32 s83, s27, 16 ; GFX9-NEXT: s_lshr_b32 s84, s27, 8 ; GFX9-NEXT: s_lshr_b32 s85, s26, 16 ; GFX9-NEXT: s_lshr_b32 s86, s26, 8 @@ -81168,8 +81488,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s70, s17, 16 ; GFX9-NEXT: s_lshr_b32 s71, s17, 8 ; GFX9-NEXT: s_lshr_b32 s80, s16, 16 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b32 s81, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 ; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 @@ -81186,22 +81506,22 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 ; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 ; GFX9-NEXT: .LBB57_3: ; %end -; GFX9-NEXT: s_lshl_b32 s46, s46, 8 +; GFX9-NEXT: s_lshl_b32 s47, s81, 8 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff -; GFX9-NEXT: s_or_b32 s16, s16, s46 -; GFX9-NEXT: s_lshl_b32 s46, s36, 8 -; GFX9-NEXT: s_and_b32 s47, s80, 0xff -; GFX9-NEXT: s_or_b32 s46, s47, s46 +; GFX9-NEXT: s_or_b32 s16, s16, s47 +; GFX9-NEXT: s_lshl_b32 s47, s36, 8 +; GFX9-NEXT: s_and_b32 s57, s80, 0xff +; GFX9-NEXT: s_or_b32 s47, s57, s47 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; 
GFX9-NEXT: s_lshl_b32 s46, s46, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: s_lshl_b32 s47, s47, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s47 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s71, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: s_and_b32 s17, s70, 0xff -; GFX9-NEXT: s_lshl_b32 s46, s69, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s46 +; GFX9-NEXT: s_lshl_b32 s47, s69, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s47 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 @@ -81299,16 +81619,17 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s16, s27, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s84, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s81, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s83, 8 +; GFX9-NEXT: s_and_b32 s17, s83, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s82, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s16 -; GFX9-NEXT: s_lshl_b32 s16, s82, 8 +; GFX9-NEXT: v_readlane_b32 s16, v21, 49 +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_and_b32 s17, s28, 0xff -; GFX9-NEXT: v_readlane_b32 s18, v21, 50 +; GFX9-NEXT: v_readlane_b32 s18, v21, 48 ; GFX9-NEXT: s_or_b32 s16, s17, s16 ; GFX9-NEXT: s_lshl_b32 s17, s88, 8 ; GFX9-NEXT: s_and_b32 s18, s18, 0xff @@ -81316,20 +81637,20 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 49 +; GFX9-NEXT: v_readlane_b32 s17, v21, 47 ; GFX9-NEXT: v_mov_b32_e32 v13, s16 ; GFX9-NEXT: s_and_b32 s16, s29, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 48 -; GFX9-NEXT: 
v_readlane_b32 s18, v21, 47 +; GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; GFX9-NEXT: v_readlane_b32 s18, v21, 45 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; GFX9-NEXT: v_readlane_b32 s17, v21, 44 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 @@ -81347,75 +81668,75 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s16, s44, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 45 +; GFX9-NEXT: v_readlane_b32 s17, v21, 43 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s78, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 44 +; GFX9-NEXT: v_readlane_b32 s17, v21, 42 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s45, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 43 -; GFX9-NEXT: v_readlane_b32 s18, v21, 42 +; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: v_readlane_b32 s18, v21, 40 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: v_readlane_b32 s17, v21, 39 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 
; GFX9-NEXT: s_and_b32 s16, s42, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 40 +; GFX9-NEXT: v_readlane_b32 s17, v21, 38 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s76, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 39 +; GFX9-NEXT: v_readlane_b32 s17, v21, 37 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s43, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 38 -; GFX9-NEXT: v_readlane_b32 s18, v21, 37 +; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: v_readlane_b32 s18, v21, 35 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: v_readlane_b32 s17, v21, 34 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s40, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 35 +; GFX9-NEXT: v_readlane_b32 s17, v21, 33 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s74, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 34 +; GFX9-NEXT: v_readlane_b32 s17, v21, 32 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s41, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 
-; GFX9-NEXT: v_readlane_b32 s17, v21, 33 -; GFX9-NEXT: v_readlane_b32 s18, v21, 32 +; GFX9-NEXT: v_readlane_b32 s17, v21, 31 +; GFX9-NEXT: v_readlane_b32 s18, v21, 30 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 @@ -81424,11 +81745,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_readlane_b32 s16, v21, 31 +; GFX9-NEXT: v_readlane_b32 s16, v21, 29 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s16 -; GFX9-NEXT: v_readlane_b32 s16, v21, 30 +; GFX9-NEXT: v_readlane_b32 s16, v21, 28 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s72, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 @@ -81438,11 +81759,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s14, s15, 0xff -; GFX9-NEXT: v_readlane_b32 s15, v21, 29 +; GFX9-NEXT: v_readlane_b32 s15, v21, 27 ; GFX9-NEXT: s_lshl_b32 s15, s15, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 -; GFX9-NEXT: v_readlane_b32 s15, v21, 28 -; GFX9-NEXT: v_readlane_b32 s16, v21, 27 +; GFX9-NEXT: v_readlane_b32 s15, v21, 26 +; GFX9-NEXT: v_readlane_b32 s16, v21, 25 ; GFX9-NEXT: s_and_b32 s15, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s15, s15, s16 @@ -81451,11 +81772,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_readlane_b32 s14, v21, 26 +; GFX9-NEXT: v_readlane_b32 s14, v21, 24 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; 
GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: v_readlane_b32 s14, v21, 25 +; GFX9-NEXT: v_readlane_b32 s14, v21, 23 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s15, s62, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 @@ -81465,11 +81786,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: s_and_b32 s12, s13, 0xff -; GFX9-NEXT: v_readlane_b32 s13, v21, 24 +; GFX9-NEXT: v_readlane_b32 s13, v21, 22 ; GFX9-NEXT: s_lshl_b32 s13, s13, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 -; GFX9-NEXT: v_readlane_b32 s13, v21, 23 -; GFX9-NEXT: v_readlane_b32 s14, v21, 22 +; GFX9-NEXT: v_readlane_b32 s13, v21, 21 +; GFX9-NEXT: v_readlane_b32 s14, v21, 20 ; GFX9-NEXT: s_and_b32 s13, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s13, s13, s14 @@ -81478,11 +81799,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 21 +; GFX9-NEXT: v_readlane_b32 s12, v21, 19 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 20 +; GFX9-NEXT: v_readlane_b32 s12, v21, 18 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s13, s60, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 @@ -81492,11 +81813,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: v_readlane_b32 s11, v21, 19 +; GFX9-NEXT: v_readlane_b32 s11, v21, 17 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 -; GFX9-NEXT: v_readlane_b32 s11, v21, 18 -; GFX9-NEXT: 
v_readlane_b32 s12, v21, 17 +; GFX9-NEXT: v_readlane_b32 s11, v21, 16 +; GFX9-NEXT: v_readlane_b32 s12, v21, 15 ; GFX9-NEXT: s_and_b32 s11, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s11, s11, s12 @@ -81505,11 +81826,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 16 +; GFX9-NEXT: v_readlane_b32 s10, v21, 14 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 15 +; GFX9-NEXT: v_readlane_b32 s10, v21, 13 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s58, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 @@ -81519,11 +81840,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: v_readlane_b32 s9, v21, 14 +; GFX9-NEXT: v_readlane_b32 s9, v21, 12 ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: v_readlane_b32 s9, v21, 13 -; GFX9-NEXT: v_readlane_b32 s10, v21, 12 +; GFX9-NEXT: v_readlane_b32 s9, v21, 11 +; GFX9-NEXT: v_readlane_b32 s10, v21, 10 ; GFX9-NEXT: s_and_b32 s9, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 @@ -81532,11 +81853,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 11 +; GFX9-NEXT: v_readlane_b32 s8, v21, 9 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 10 +; GFX9-NEXT: 
v_readlane_b32 s8, v21, 8 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s56, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 @@ -81546,11 +81867,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: v_readlane_b32 s7, v21, 9 +; GFX9-NEXT: v_readlane_b32 s7, v21, 7 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_readlane_b32 s7, v21, 8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 7 +; GFX9-NEXT: v_readlane_b32 s7, v21, 6 +; GFX9-NEXT: v_readlane_b32 s8, v21, 5 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 @@ -81559,14 +81880,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_readlane_b32 s6, v21, 6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 4 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: v_readlane_b32 s6, v21, 5 -; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: v_readlane_b32 s6, v21, 3 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_lshl_b32 s7, s46, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ -81574,11 +81894,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: v_readlane_b32 s5, v21, 4 +; GFX9-NEXT: v_readlane_b32 s5, v21, 2 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_readlane_b32 s5, v21, 3 -; GFX9-NEXT: 
v_readlane_b32 s6, v21, 2 +; GFX9-NEXT: v_readlane_b32 s5, v21, 1 +; GFX9-NEXT: v_readlane_b32 s6, v21, 0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 @@ -81587,7 +81907,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_readlane_b32 s9, v21, 1 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: v_readlane_b32 s99, v20, 35 ; GFX9-NEXT: v_readlane_b32 s98, v20, 34 @@ -81632,16 +81951,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB57_4: -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: v_writelane_b32 v21, s82, 0 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 ; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr81 ; GFX9-NEXT: ; implicit-def: $sgpr80 ; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr70 @@ -81669,7 +81983,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr86 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 -; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -81685,100 +82000,103 @@ define inreg <128 x i8> 
@bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr56 -; GFX9-NEXT: v_writelane_b32 v21, s83, 1 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; 
implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: 
killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: 
$sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB57_2 ; ; GFX11-LABEL: bitcast_v16i64_to_v128i8_scalar: @@ -81786,221 +82104,310 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b32 off, v16, s32 -; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v18, s32 +; GFX11-NEXT: scratch_store_b32 off, v19, s32 offset:4 +; 
GFX11-NEXT: scratch_store_b32 off, v20, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v21, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v16, s30, 0 -; GFX11-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-NEXT: v_writelane_b32 v18, s30, 0 +; GFX11-NEXT: v_writelane_b32 v19, s96, 0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 ; GFX11-NEXT: v_readfirstlane_b32 s40, v1 ; GFX11-NEXT: v_readfirstlane_b32 s41, v2 -; GFX11-NEXT: v_writelane_b32 v16, s31, 1 -; GFX11-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-NEXT: v_writelane_b32 v18, s31, 1 +; GFX11-NEXT: v_writelane_b32 v19, s97, 1 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 ; GFX11-NEXT: v_readfirstlane_b32 s12, v5 -; GFX11-NEXT: v_writelane_b32 v16, s34, 2 -; GFX11-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-NEXT: v_writelane_b32 v18, s34, 2 +; GFX11-NEXT: v_writelane_b32 v19, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s13, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v16, s35, 3 -; GFX11-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-NEXT: v_writelane_b32 v18, s35, 3 +; GFX11-NEXT: v_writelane_b32 v19, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v16, s36, 4 -; GFX11-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-NEXT: v_writelane_b32 v18, s36, 4 +; GFX11-NEXT: v_writelane_b32 v19, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s4, v13 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 -; GFX11-NEXT: v_writelane_b32 v16, s37, 5 -; GFX11-NEXT: v_writelane_b32 v17, s101, 5 -; GFX11-NEXT: s_mov_b32 s101, 0 +; GFX11-NEXT: v_writelane_b32 v18, s37, 5 +; GFX11-NEXT: v_writelane_b32 v19, s101, 5 +; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane -; 
GFX11-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v16, s38, 6 -; GFX11-NEXT: v_writelane_b32 v17, s102, 6 -; GFX11-NEXT: v_writelane_b32 v16, s39, 7 -; GFX11-NEXT: v_writelane_b32 v17, s103, 7 -; GFX11-NEXT: v_writelane_b32 v16, s48, 8 -; GFX11-NEXT: v_writelane_b32 v17, s104, 8 -; GFX11-NEXT: v_writelane_b32 v16, s49, 9 -; GFX11-NEXT: v_writelane_b32 v16, s50, 10 -; GFX11-NEXT: v_writelane_b32 v16, s51, 11 -; GFX11-NEXT: v_writelane_b32 v16, s52, 12 -; GFX11-NEXT: v_writelane_b32 v16, s53, 13 -; GFX11-NEXT: v_writelane_b32 v16, s54, 14 -; GFX11-NEXT: v_writelane_b32 v16, s55, 15 -; GFX11-NEXT: v_writelane_b32 v16, s64, 16 -; GFX11-NEXT: v_writelane_b32 v16, s65, 17 -; GFX11-NEXT: v_writelane_b32 v16, s66, 18 -; GFX11-NEXT: v_writelane_b32 v16, s67, 19 -; GFX11-NEXT: v_writelane_b32 v16, s68, 20 -; GFX11-NEXT: v_writelane_b32 v16, s69, 21 -; GFX11-NEXT: v_writelane_b32 v16, s70, 22 -; GFX11-NEXT: v_writelane_b32 v16, s71, 23 -; GFX11-NEXT: v_writelane_b32 v16, s80, 24 -; GFX11-NEXT: v_writelane_b32 v16, s81, 25 -; GFX11-NEXT: v_writelane_b32 v16, s82, 26 -; GFX11-NEXT: v_writelane_b32 v16, s83, 27 -; GFX11-NEXT: v_writelane_b32 v16, s84, 28 -; GFX11-NEXT: v_writelane_b32 v16, s85, 29 -; GFX11-NEXT: v_writelane_b32 v16, s86, 30 -; GFX11-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr20 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v18, s38, 6 +; GFX11-NEXT: v_writelane_b32 v19, s102, 6 +; GFX11-NEXT: v_writelane_b32 v18, s39, 7 +; GFX11-NEXT: v_writelane_b32 v19, s103, 7 +; GFX11-NEXT: v_writelane_b32 v18, s48, 8 +; GFX11-NEXT: v_writelane_b32 v19, s104, 8 +; GFX11-NEXT: v_writelane_b32 v18, s49, 9 +; GFX11-NEXT: v_writelane_b32 v18, s50, 10 +; GFX11-NEXT: v_writelane_b32 v18, s51, 11 +; GFX11-NEXT: v_writelane_b32 v18, s52, 12 +; GFX11-NEXT: v_writelane_b32 v18, s53, 13 +; GFX11-NEXT: v_writelane_b32 v18, s54, 14 +; 
GFX11-NEXT: v_writelane_b32 v18, s55, 15 +; GFX11-NEXT: v_writelane_b32 v18, s64, 16 +; GFX11-NEXT: v_writelane_b32 v18, s65, 17 +; GFX11-NEXT: v_writelane_b32 v18, s66, 18 +; GFX11-NEXT: v_writelane_b32 v18, s67, 19 +; GFX11-NEXT: v_writelane_b32 v18, s68, 20 +; GFX11-NEXT: v_writelane_b32 v18, s69, 21 +; GFX11-NEXT: v_writelane_b32 v18, s70, 22 +; GFX11-NEXT: v_writelane_b32 v18, s71, 23 +; GFX11-NEXT: v_writelane_b32 v18, s80, 24 +; GFX11-NEXT: v_writelane_b32 v18, s81, 25 +; GFX11-NEXT: v_writelane_b32 v18, s82, 26 +; GFX11-NEXT: v_writelane_b32 v18, s83, 27 +; GFX11-NEXT: v_writelane_b32 v18, s84, 28 +; GFX11-NEXT: v_writelane_b32 v18, s85, 29 +; GFX11-NEXT: v_writelane_b32 v18, s86, 30 +; GFX11-NEXT: v_writelane_b32 v18, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB57_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s43, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 16 -; GFX11-NEXT: s_lshr_b32 s43, s24, 16 -; GFX11-NEXT: s_lshr_b32 s104, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 17 -; GFX11-NEXT: s_lshr_b32 s43, s24, 8 -; GFX11-NEXT: s_lshr_b32 s57, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 18 -; GFX11-NEXT: s_lshr_b32 s43, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s69, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 19 -; GFX11-NEXT: s_lshr_b32 s43, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 20 -; GFX11-NEXT: s_lshr_b32 s43, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 21 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 
s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 22 -; GFX11-NEXT: s_lshr_b32 s43, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 23 -; GFX11-NEXT: s_lshr_b32 s43, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 24 -; GFX11-NEXT: s_lshr_b32 s43, s21, 16 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 25 -; GFX11-NEXT: s_lshr_b32 s43, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 26 -; GFX11-NEXT: s_lshr_b32 s43, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 27 -; GFX11-NEXT: s_lshr_b32 s43, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 28 -; GFX11-NEXT: s_lshr_b32 s43, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 30 -; GFX11-NEXT: s_lshr_b32 s43, s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 31 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; 
GFX11-NEXT: v_writelane_b32 v18, s43, 0 -; GFX11-NEXT: s_lshr_b32 s43, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 1 -; GFX11-NEXT: s_lshr_b32 s43, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 2 -; GFX11-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s99, s1, 16 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: v_writelane_b32 v18, s43, 3 -; GFX11-NEXT: s_lshr_b32 s43, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 4 -; GFX11-NEXT: s_lshr_b32 s43, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-NEXT: s_lshr_b32 s44, s7, 24 +; GFX11-NEXT: s_lshr_b32 s36, s5, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 2 +; GFX11-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-NEXT: s_lshr_b32 s34, s5, 16 +; GFX11-NEXT: s_lshr_b32 s35, s5, 8 +; GFX11-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 3 +; GFX11-NEXT: s_lshr_b32 s44, s7, 8 +; GFX11-NEXT: s_lshr_b32 s43, s4, 8 +; GFX11-NEXT: s_lshr_b32 s48, s14, 16 +; GFX11-NEXT: s_lshr_b32 s49, s14, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 4 +; GFX11-NEXT: s_lshr_b32 s44, s6, 16 +; GFX11-NEXT: s_lshr_b32 s55, s28, 16 +; GFX11-NEXT: s_lshr_b32 s64, s28, 8 +; GFX11-NEXT: s_lshr_b32 s65, s24, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 5 +; GFX11-NEXT: s_lshr_b32 s44, s6, 8 +; GFX11-NEXT: s_lshr_b32 s66, s24, 8 +; 
GFX11-NEXT: s_lshr_b32 s37, s23, 24 +; GFX11-NEXT: s_lshr_b32 s38, s23, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 6 +; GFX11-NEXT: s_lshr_b32 s44, s9, 24 +; GFX11-NEXT: s_lshr_b32 s39, s23, 8 +; GFX11-NEXT: s_lshr_b32 s50, s22, 16 +; GFX11-NEXT: s_lshr_b32 s51, s22, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 7 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s52, s21, 24 +; GFX11-NEXT: s_lshr_b32 s53, s21, 16 +; GFX11-NEXT: s_lshr_b32 s54, s21, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 8 +; GFX11-NEXT: s_lshr_b32 s44, s9, 8 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 9 +; GFX11-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: s_lshr_b32 s80, s18, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 10 +; GFX11-NEXT: s_lshr_b32 s44, s8, 8 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: s_lshr_b32 s82, s17, 24 +; GFX11-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 11 +; GFX11-NEXT: s_lshr_b32 s44, s11, 24 +; GFX11-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: s_lshr_b32 s86, s16, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 12 +; GFX11-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-NEXT: s_lshr_b32 s87, s3, 24 +; GFX11-NEXT: s_lshr_b32 s96, s3, 16 +; GFX11-NEXT: s_lshr_b32 s97, s3, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 13 +; GFX11-NEXT: s_lshr_b32 s44, s11, 8 +; GFX11-NEXT: s_lshr_b32 s98, s2, 16 +; GFX11-NEXT: s_lshr_b32 s99, s2, 8 +; GFX11-NEXT: s_lshr_b32 s100, s1, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 14 +; GFX11-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-NEXT: s_lshr_b32 s101, s1, 16 +; GFX11-NEXT: s_lshr_b32 s102, s1, 8 +; GFX11-NEXT: s_lshr_b32 s103, s0, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 15 +; GFX11-NEXT: s_lshr_b32 s44, s10, 8 +; GFX11-NEXT: s_lshr_b32 s104, s0, 8 +; GFX11-NEXT: s_lshr_b64 
s[56:57], s[6:7], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 17 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[26:27], 24 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 6 -; GFX11-NEXT: s_lshr_b32 s43, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 18 +; GFX11-NEXT: s_lshr_b32 s44, s13, 8 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 7 -; GFX11-NEXT: s_lshr_b32 s43, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 19 +; GFX11-NEXT: s_lshr_b32 s44, s12, 16 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 8 -; GFX11-NEXT: s_lshr_b32 s43, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 9 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 20 +; GFX11-NEXT: 
s_lshr_b32 s44, s12, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 21 +; GFX11-NEXT: s_lshr_b32 s44, s15, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 22 +; GFX11-NEXT: s_lshr_b32 s44, s15, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 23 +; GFX11-NEXT: s_lshr_b32 s44, s15, 8 +; GFX11-NEXT: v_writelane_b32 v21, s44, 24 +; GFX11-NEXT: s_lshr_b32 s44, s41, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 25 +; GFX11-NEXT: s_lshr_b32 s44, s41, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 26 +; GFX11-NEXT: s_lshr_b32 s44, s41, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 27 +; GFX11-NEXT: s_lshr_b32 s44, s40, 16 +; GFX11-NEXT: v_writelane_b32 v21, s44, 28 +; GFX11-NEXT: s_lshr_b32 s44, s40, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 29 +; GFX11-NEXT: s_lshr_b32 s44, s29, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 30 +; GFX11-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s44, 31 +; GFX11-NEXT: s_lshr_b32 s44, s29, 8 +; GFX11-NEXT: v_writelane_b32 v20, s44, 0 +; GFX11-NEXT: s_lshr_b32 s44, s27, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s44, 1 +; GFX11-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-NEXT: v_writelane_b32 v20, s44, 2 +; GFX11-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s44, 3 +; GFX11-NEXT: s_lshr_b32 s44, s26, 16 +; 
GFX11-NEXT: v_writelane_b32 v20, s44, 4 +; GFX11-NEXT: s_lshr_b32 s44, s26, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s44, 5 +; GFX11-NEXT: s_lshr_b32 s44, s25, 24 +; GFX11-NEXT: v_writelane_b32 v20, s44, 6 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s44, 7 +; GFX11-NEXT: s_lshr_b32 s44, s25, 8 +; GFX11-NEXT: v_writelane_b32 v20, s44, 8 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 0 +; GFX11-NEXT: v_writelane_b32 v21, s45, 1 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 ; GFX11-NEXT: s_branch .LBB57_3 ; GFX11-NEXT: .LBB57_2: -; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: s_mov_b32 vcc_hi, -1 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr30 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr99 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr97 +; GFX11-NEXT: ; implicit-def: $sgpr96 +; GFX11-NEXT: ; implicit-def: $sgpr87 +; GFX11-NEXT: ; implicit-def: $sgpr86 +; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr92 +; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr83 +; GFX11-NEXT: ; implicit-def: $sgpr82 +; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; 
implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr90 +; GFX11-NEXT: ; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr70 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr78 +; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr52 +; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr64 +; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr43 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr88 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr76 +; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr74 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr72 +; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: v_writelane_b32 v21, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: v_writelane_b32 v21, vcc_hi, 1 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; 
kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 0 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -82010,8 +82417,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 1 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -82022,7 +82427,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 2 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -82033,8 +82437,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 3 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -82045,7 +82447,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; 
implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 4 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -82056,90 +82457,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 5 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr45 -; GFX11-NEXT: ; implicit-def: $sgpr44 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr62 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr97 -; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr73 -; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr86 -; GFX11-NEXT: ; implicit-def: $sgpr85 -; GFX11-NEXT: ; implicit-def: $sgpr84 -; GFX11-NEXT: ; implicit-def: $sgpr83 -; GFX11-NEXT: ; implicit-def: $sgpr82 -; GFX11-NEXT: ; implicit-def: $sgpr81 -; GFX11-NEXT: ; implicit-def: $sgpr61 -; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr71 -; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr58 -; GFX11-NEXT: ; implicit-def: $sgpr59 -; GFX11-NEXT: ; implicit-def: $sgpr68 -; GFX11-NEXT: ; implicit-def: $sgpr67 -; GFX11-NEXT: ; implicit-def: $sgpr66 -; GFX11-NEXT: ; implicit-def: $sgpr65 -; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: 
$sgpr55 -; GFX11-NEXT: ; implicit-def: $sgpr54 -; GFX11-NEXT: ; implicit-def: $sgpr53 -; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr51 -; GFX11-NEXT: ; implicit-def: $sgpr50 -; GFX11-NEXT: ; implicit-def: $sgpr49 -; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr37 -; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr56 -; GFX11-NEXT: ; implicit-def: $sgpr69 -; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr47 -; GFX11-NEXT: ; implicit-def: $sgpr57 -; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 7 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 9 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 11 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 13 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 14 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 15 ; GFX11-NEXT: .LBB57_3: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 -; GFX11-NEXT: s_mov_b32 s101, s104 -; GFX11-NEXT: s_mov_b32 s104, s57 -; GFX11-NEXT: s_mov_b32 s57, s69 -; GFX11-NEXT: s_mov_b32 s69, s42 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-NEXT: s_mov_b32 vcc_hi, s36 +; GFX11-NEXT: s_mov_b32 s36, s42 +; GFX11-NEXT: s_mov_b32 s42, s43 ; GFX11-NEXT: s_cbranch_vccnz .LBB57_5 ; GFX11-NEXT: ; %bb.4: ; %cmp.true ; GFX11-NEXT: 
s_add_u32 s0, s0, 3 @@ -82174,557 +82498,567 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_addc_u32 s7, s7, 0 ; GFX11-NEXT: s_add_u32 s4, s4, 3 ; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s24, 16 -; GFX11-NEXT: s_lshr_b32 s101, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s24, 8 -; GFX11-NEXT: s_lshr_b32 s104, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s22, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: 
s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s69, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; 
GFX11-NEXT: s_lshr_b32 s99, s1, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-NEXT: s_lshr_b32 s43, s7, 24 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 2 +; GFX11-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s5, 24 +; GFX11-NEXT: s_lshr_b32 s34, s5, 16 +; GFX11-NEXT: s_lshr_b32 s35, s5, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 3 +; GFX11-NEXT: s_lshr_b32 s43, s7, 8 +; GFX11-NEXT: s_lshr_b32 s36, s4, 16 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s48, s14, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 4 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s49, s14, 8 +; GFX11-NEXT: s_lshr_b32 s55, s28, 16 +; GFX11-NEXT: s_lshr_b32 s64, s28, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 5 +; GFX11-NEXT: s_lshr_b32 s43, s6, 8 +; GFX11-NEXT: s_lshr_b32 s65, s24, 16 +; GFX11-NEXT: s_lshr_b32 s66, s24, 8 +; GFX11-NEXT: s_lshr_b32 s37, s23, 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 6 +; GFX11-NEXT: s_lshr_b32 s43, s9, 24 +; GFX11-NEXT: s_lshr_b32 s38, s23, 16 +; GFX11-NEXT: s_lshr_b32 s39, s23, 8 +; GFX11-NEXT: s_lshr_b32 s50, s22, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 7 +; GFX11-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-NEXT: s_lshr_b32 s51, s22, 8 +; GFX11-NEXT: 
s_lshr_b32 s52, s21, 24 +; GFX11-NEXT: s_lshr_b32 s53, s21, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 8 +; GFX11-NEXT: s_lshr_b32 s43, s9, 8 +; GFX11-NEXT: s_lshr_b32 s54, s21, 8 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 9 +; GFX11-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 10 +; GFX11-NEXT: s_lshr_b32 s43, s8, 8 +; GFX11-NEXT: s_lshr_b32 s80, s18, 16 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: s_lshr_b32 s82, s17, 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 11 +; GFX11-NEXT: s_lshr_b32 s43, s11, 24 +; GFX11-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 12 +; GFX11-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-NEXT: s_lshr_b32 s86, s16, 8 +; GFX11-NEXT: s_lshr_b32 s87, s3, 24 +; GFX11-NEXT: s_lshr_b32 s96, s3, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 13 +; GFX11-NEXT: s_lshr_b32 s43, s11, 8 +; GFX11-NEXT: s_lshr_b32 s97, s3, 8 +; GFX11-NEXT: s_lshr_b32 s98, s2, 16 +; GFX11-NEXT: s_lshr_b32 s99, s2, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 14 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s100, s1, 24 +; GFX11-NEXT: s_lshr_b32 s101, s1, 16 +; GFX11-NEXT: s_lshr_b32 s102, s1, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 15 +; GFX11-NEXT: s_lshr_b32 s43, s10, 8 +; GFX11-NEXT: s_lshr_b32 s103, s0, 16 +; GFX11-NEXT: s_lshr_b32 s104, s0, 8 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 16 +; GFX11-NEXT: s_lshr_b32 s43, s13, 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 17 +; GFX11-NEXT: s_lshr_b32 s43, s13, 16 +; GFX11-NEXT: s_lshr_b64 s[58:59], 
s[40:41], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[26:27], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 18 +; GFX11-NEXT: s_lshr_b32 s43, s13, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 19 +; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 8 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 20 +; GFX11-NEXT: s_lshr_b32 s43, s12, 8 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 9 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 21 +; GFX11-NEXT: s_lshr_b32 s43, s15, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s43, 22 +; GFX11-NEXT: s_lshr_b32 s43, s15, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 23 +; GFX11-NEXT: s_lshr_b32 s43, s15, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s43, 24 +; GFX11-NEXT: s_lshr_b32 s43, s41, 24 
+; GFX11-NEXT: v_writelane_b32 v21, s43, 25 +; GFX11-NEXT: s_lshr_b32 s43, s41, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s43, 26 +; GFX11-NEXT: s_lshr_b32 s43, s41, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 27 +; GFX11-NEXT: s_lshr_b32 s43, s40, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s43, 28 +; GFX11-NEXT: s_lshr_b32 s43, s40, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 29 +; GFX11-NEXT: s_lshr_b32 s43, s29, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s43, 30 +; GFX11-NEXT: s_lshr_b32 s43, s29, 16 +; GFX11-NEXT: v_writelane_b32 v21, s43, 31 +; GFX11-NEXT: s_lshr_b32 s43, s29, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-NEXT: v_writelane_b32 v20, s43, 0 +; GFX11-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-NEXT: v_writelane_b32 v21, s44, 0 +; GFX11-NEXT: v_writelane_b32 v20, s43, 1 +; GFX11-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-NEXT: v_writelane_b32 v21, s45, 1 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 +; GFX11-NEXT: v_writelane_b32 v20, s43, 2 +; GFX11-NEXT: s_lshr_b32 s43, s27, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s43, 3 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: v_writelane_b32 v20, s43, 4 +; GFX11-NEXT: s_lshr_b32 s43, s26, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s43, 5 +; GFX11-NEXT: s_lshr_b32 s43, s25, 24 +; GFX11-NEXT: v_writelane_b32 v20, s43, 6 +; GFX11-NEXT: s_lshr_b32 s43, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s43, 7 +; GFX11-NEXT: s_lshr_b32 s43, s25, 8 +; GFX11-NEXT: v_writelane_b32 v20, s43, 8 ; GFX11-NEXT: .LBB57_5: ; %end -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s42, s74, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s43 -; GFX11-NEXT: s_lshl_b32 s43, s94, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_lshl_b32 s45, s45, 8 -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-NEXT: s_lshl_b32 s43, s104, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 9 -; GFX11-NEXT: s_or_b32 s0, s0, s45 -; GFX11-NEXT: s_lshl_b32 s45, s30, 8 -; GFX11-NEXT: s_and_b32 s44, s44, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s42, s42, 8 +; GFX11-NEXT: s_and_b32 s45, s103, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s43 +; GFX11-NEXT: s_lshl_b32 s43, s30, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 8 -; GFX11-NEXT: v_readlane_b32 s43, v18, 7 -; GFX11-NEXT: s_or_b32 s0, s0, s44 +; GFX11-NEXT: s_or_b32 s43, s45, s43 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s44, s100, 8 -; GFX11-NEXT: s_lshl_b32 s45, s98, 8 -; GFX11-NEXT: s_or_b32 s1, s1, s44 -; GFX11-NEXT: s_and_b32 s44, s99, 0xff -; GFX11-NEXT: s_and_b32 s42, s42, 0xff -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 +; GFX11-NEXT: s_lshl_b32 s45, s100, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s43 +; GFX11-NEXT: s_lshl_b32 s43, s102, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s43 +; GFX11-NEXT: s_and_b32 s43, s101, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: 
s_or_b32 s1, s1, s44 -; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_readlane_b32 s0, v18, 6 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: v_readlane_b32 s2, v18, 5 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: s_or_b32 s43, s43, s45 +; GFX11-NEXT: s_and_b32 s45, s98, 0xff +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s43 +; GFX11-NEXT: s_lshl_b32 s43, s99, 8 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_or_b32 s2, s2, s43 +; GFX11-NEXT: s_lshl_b32 s43, s94, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s43, s45, s43 +; GFX11-NEXT: s_lshl_b32 s45, s87, 8 +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 +; GFX11-NEXT: s_lshl_b32 s0, s86, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s43 +; GFX11-NEXT: s_lshl_b32 s43, s97, 8 ; GFX11-NEXT: s_and_b32 s1, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s3, v18, 2 +; GFX11-NEXT: s_or_b32 s3, s3, s43 +; GFX11-NEXT: s_and_b32 s43, s96, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_or_b32 s43, s43, s45 ; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 ; GFX11-NEXT: s_lshl_b32 s1, s92, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s3, s3, s43 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: s_and_b32 s2, s85, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v18, 4 +; GFX11-NEXT: s_lshl_b32 s2, s84, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s3, s82, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s17, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; 
GFX11-NEXT: v_readlane_b32 s16, v18, 0 +; GFX11-NEXT: s_and_b32 s16, s80, 0xff ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 3 +; GFX11-NEXT: s_and_b32 s2, s83, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s17, v19, 29 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s100, v17, 4 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s99, v17, 3 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s18, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: s_lshl_b32 s17, s69, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 1 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_readlane_b32 s0, v19, 28 -; GFX11-NEXT: s_and_b32 s1, s20, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 19 +; GFX11-NEXT: s_lshl_b32 s2, s81, 8 +; GFX11-NEXT: s_and_b32 s18, s20, 0xff ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_lshl_b32 s3, s90, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: v_readlane_b32 s16, v19, 31 +; GFX11-NEXT: s_lshl_b32 s16, s71, 8 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s19, 0xff -; GFX11-NEXT: s_lshl_b32 s16, s16, 8 -; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s19, s67, 0xff ; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: v_readlane_b32 s16, v19, 30 +; GFX11-NEXT: s_and_b32 s16, s70, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s88, 8 +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_lshl_b32 s17, s68, 8 +; GFX11-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-NEXT: s_or_b32 s17, s18, s17 +; GFX11-NEXT: s_lshl_b32 s18, s78, 8 +; GFX11-NEXT: s_and_b32 s17, s17, 0xffff +; GFX11-NEXT: s_or_b32 s18, s19, s18 +; GFX11-NEXT: s_and_b32 s19, s28, 0xff +; GFX11-NEXT: 
s_lshl_b32 s18, s18, 16 +; GFX11-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-NEXT: s_or_b32 s17, s17, s18 +; GFX11-NEXT: s_and_b32 s18, s24, 0xff +; GFX11-NEXT: v_mov_b32_e32 v10, s17 +; GFX11-NEXT: s_lshl_b32 s17, s66, 8 +; GFX11-NEXT: s_and_b32 s1, s14, 0xff +; GFX11-NEXT: s_or_b32 s17, s18, s17 +; GFX11-NEXT: s_and_b32 s18, s65, 0xff +; GFX11-NEXT: s_and_b32 s17, s17, 0xffff +; GFX11-NEXT: s_or_b32 s0, s18, s0 +; GFX11-NEXT: s_lshl_b32 s18, s64, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s18, s19, s18 +; GFX11-NEXT: s_or_b32 s0, s17, s0 +; GFX11-NEXT: s_lshl_b32 s17, s76, 8 +; GFX11-NEXT: v_mov_b32_e32 v14, s0 +; GFX11-NEXT: s_and_b32 s0, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s16, 16 +; GFX11-NEXT: s_and_b32 s19, s55, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_lshl_b32 s2, s49, 8 +; GFX11-NEXT: v_mov_b32_e32 v9, s0 +; GFX11-NEXT: s_lshl_b32 s0, s74, 8 +; GFX11-NEXT: s_and_b32 s3, s48, 0xff +; GFX11-NEXT: s_or_b32 s17, s19, s17 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s0, s3, s0 +; GFX11-NEXT: s_and_b32 s18, s18, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s17, s18, s17 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s17 +; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off +; GFX11-NEXT: v_mov_b32_e32 v5, s0 +; GFX11-NEXT: s_and_b32 s0, s21, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s54, 8 +; GFX11-NEXT: s_and_b32 s2, s53, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s52, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s51, 8 +; GFX11-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-NEXT: s_and_b32 s14, s50, 0xff +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_or_b32 s2, s14, s3 +; GFX11-NEXT: s_and_b32 s3, s23, 0xff +; 
GFX11-NEXT: s_lshl_b32 s14, s39, 8 +; GFX11-NEXT: s_and_b32 s16, s38, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s37, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s14 +; GFX11-NEXT: s_or_b32 s14, s16, s17 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s78, 8 +; GFX11-NEXT: s_lshl_b32 s14, s14, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s14 +; GFX11-NEXT: v_dual_mov_b32 v11, s0 :: v_dual_mov_b32 v12, s1 +; GFX11-NEXT: v_mov_b32_e32 v13, s2 +; GFX11-NEXT: v_readlane_b32 s1, v21, 16 +; GFX11-NEXT: v_readlane_b32 s2, v21, 15 +; GFX11-NEXT: s_and_b32 s0, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s72, 8 +; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s18, s18, 8 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s86, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 21 -; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_readlane_b32 s98, v17, 2 -; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 27 -; GFX11-NEXT: v_readlane_b32 s3, v19, 24 -; GFX11-NEXT: v_readlane_b32 s16, v19, 22 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_readlane_b32 s2, v20, 7 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_readlane_b32 s1, v20, 8 +; GFX11-NEXT: v_readlane_b32 s3, v20, 6 +; GFX11-NEXT: v_mov_b32_e32 v9, s0 +; GFX11-NEXT: s_and_b32 s0, s25, 0xff ; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; 
GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v19, 26 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: v_readlane_b32 s2, v20, 5 +; GFX11-NEXT: v_readlane_b32 s3, v20, 4 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s60, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s21, 0xff +; GFX11-NEXT: s_and_b32 s1, s26, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 25 +; GFX11-NEXT: s_or_b32 s2, s3, s10 +; GFX11-NEXT: v_readlane_b32 s10, v20, 3 +; GFX11-NEXT: v_readlane_b32 s14, v20, 2 +; GFX11-NEXT: v_readlane_b32 s16, v20, 1 +; GFX11-NEXT: s_and_b32 s3, s27, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s31, v16, 1 -; GFX11-NEXT: v_readlane_b32 s30, v16, 0 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s10 +; GFX11-NEXT: s_or_b32 s10, s14, s16 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 23 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_readlane_b32 s1, v19, 18 -; GFX11-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s10 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s62, 8 
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: s_and_b32 s16, s23, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 20 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-NEXT: v_readlane_b32 s1, v21, 6 +; GFX11-NEXT: v_readlane_b32 s2, v21, 5 +; GFX11-NEXT: v_mov_b32_e32 v15, s0 +; GFX11-NEXT: s_and_b32 s0, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s56, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s17, s17, s18 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: s_lshl_b32 s17, s97, 8 -; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 17 -; GFX11-NEXT: s_lshl_b32 s3, s88, 8 -; GFX11-NEXT: s_and_b32 s16, s69, 0xff -; GFX11-NEXT: s_and_b32 s18, s72, 0xff -; GFX11-NEXT: v_readlane_b32 s97, v17, 1 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: v_readlane_b32 s3, v19, 16 -; GFX11-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readlane_b32 s2, v21, 31 ; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_readlane_b32 s1, v20, 0 +; GFX11-NEXT: v_readlane_b32 s3, v21, 30 +; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX11-NEXT: v_mov_b32_e32 v13, s0 +; GFX11-NEXT: s_and_b32 s0, s29, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; 
GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s73, 0xff -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s26, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s96, 8 -; GFX11-NEXT: s_lshl_b32 s17, s76, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s16, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s87, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xffff -; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 0 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s0, s28, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s85, 8 -; GFX11-NEXT: s_and_b32 s2, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s17, v19, 1 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s83, 8 -; GFX11-NEXT: s_and_b32 s16, s82, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s81, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 2 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: v_readlane_b32 s2, v21, 29 +; GFX11-NEXT: v_readlane_b32 s3, v21, 28 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s6, s58, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s40, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s61, 8 -; GFX11-NEXT: s_and_b32 s16, s80, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s18, 8 -; GFX11-NEXT: 
v_readlane_b32 s19, v19, 3 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s16, s41, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s60, 8 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s70, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xffff -; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 4 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: s_and_b32 s0, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s58, 8 -; GFX11-NEXT: s_and_b32 s2, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-NEXT: s_and_b32 s1, s40, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_readlane_b32 s6, v21, 27 +; GFX11-NEXT: v_readlane_b32 s10, v21, 26 +; GFX11-NEXT: v_readlane_b32 s14, v21, 25 +; GFX11-NEXT: s_and_b32 s3, s41, 0xff +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_or_b32 s3, s3, s6 +; GFX11-NEXT: s_or_b32 s6, s10, s14 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-NEXT: v_readlane_b32 s1, v21, 24 +; GFX11-NEXT: v_readlane_b32 s3, v21, 23 +; GFX11-NEXT: v_readlane_b32 s6, v21, 22 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_and_b32 s0, s15, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: 
s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s15, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s68, 8 -; GFX11-NEXT: s_and_b32 s14, s67, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s66, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s14, s15 -; GFX11-NEXT: v_readlane_b32 s14, v19, 6 +; GFX11-NEXT: s_or_b32 s1, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v4, s2 +; GFX11-NEXT: v_readlane_b32 s2, v21, 21 +; GFX11-NEXT: v_readlane_b32 s3, v21, 20 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s6, s46, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s65, 8 -; GFX11-NEXT: s_and_b32 s12, s64, 0xff -; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v19, 7 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s12, s14 -; GFX11-NEXT: s_and_b32 s12, s13, 0xff -; GFX11-NEXT: s_lshl_b32 s13, s55, 8 -; GFX11-NEXT: s_and_b32 s14, s54, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s53, 8 -; GFX11-NEXT: s_or_b32 s12, s12, s13 -; GFX11-NEXT: s_or_b32 s13, s14, s15 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s12, s12, 0xffff -; GFX11-NEXT: s_lshl_b32 s13, s13, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: v_readlane_b32 s12, v19, 8 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: s_and_b32 s0, s10, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s52, 8 -; GFX11-NEXT: s_and_b32 s2, s51, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-NEXT: s_and_b32 s1, s12, 0xff +; GFX11-NEXT: 
s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_readlane_b32 s6, v21, 19 +; GFX11-NEXT: v_readlane_b32 s10, v21, 18 +; GFX11-NEXT: v_readlane_b32 s12, v21, 17 +; GFX11-NEXT: s_and_b32 s3, s13, 0xff +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_or_b32 s3, s3, s6 +; GFX11-NEXT: s_or_b32 s6, s10, s12 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-NEXT: v_readlane_b32 s1, v21, 14 +; GFX11-NEXT: v_readlane_b32 s3, v21, 13 +; GFX11-NEXT: v_readlane_b32 s6, v21, 12 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_and_b32 s0, s11, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s11, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s50, 8 -; GFX11-NEXT: s_and_b32 s10, s49, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s48, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v19, 10 +; GFX11-NEXT: s_or_b32 s1, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-NEXT: v_readlane_b32 s2, v21, 11 +; GFX11-NEXT: v_readlane_b32 s3, v21, 10 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s6, s44, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s39, 8 -; GFX11-NEXT: s_and_b32 s8, s38, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v19, 11 
-; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s8, s10 -; GFX11-NEXT: s_and_b32 s8, s9, 0xff -; GFX11-NEXT: s_lshl_b32 s9, s37, 8 -; GFX11-NEXT: s_and_b32 s10, s36, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s35, 8 -; GFX11-NEXT: s_or_b32 s8, s8, s9 -; GFX11-NEXT: s_or_b32 s9, s10, s11 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s8, s8, 0xffff -; GFX11-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v19, 12 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s0, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s56, 8 -; GFX11-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-NEXT: s_and_b32 s1, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: s_and_b32 s3, s9, 0xff +; GFX11-NEXT: v_readlane_b32 s6, v21, 9 +; GFX11-NEXT: v_readlane_b32 s8, v21, 8 +; GFX11-NEXT: v_readlane_b32 s9, v21, 7 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: v_mov_b32_e32 v11, s1 +; GFX11-NEXT: v_readlane_b32 s1, v21, 4 +; GFX11-NEXT: v_readlane_b32 s3, v21, 3 +; GFX11-NEXT: v_readlane_b32 s6, v21, 2 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_and_b32 s0, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; 
GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s34, 8 -; GFX11-NEXT: s_and_b32 s6, vcc_hi, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s46, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v19, 14 +; GFX11-NEXT: s_or_b32 s1, s3, s6 +; GFX11-NEXT: v_readlane_b32 s6, v21, 0 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: v_mov_b32_e32 v12, s2 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s47, 8 -; GFX11-NEXT: s_and_b32 s4, s104, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v19, 15 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s4, s6 -; GFX11-NEXT: s_and_b32 s4, s5, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s103, 8 -; GFX11-NEXT: s_and_b32 s6, s102, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s101, 8 -; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s6, s7 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s4, s5 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s17, v19, 5 -; GFX11-NEXT: v_readlane_b32 s13, v19, 9 -; GFX11-NEXT: v_readlane_b32 s9, v19, 13 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 -; GFX11-NEXT: v_readlane_b32 s104, v17, 8 -; GFX11-NEXT: v_readlane_b32 s103, v17, 7 -; GFX11-NEXT: v_readlane_b32 s102, v17, 6 -; GFX11-NEXT: v_readlane_b32 s101, v17, 5 
-; GFX11-NEXT: v_readlane_b32 s96, v17, 0 -; GFX11-NEXT: v_readlane_b32 s87, v16, 31 -; GFX11-NEXT: v_readlane_b32 s85, v16, 29 -; GFX11-NEXT: v_readlane_b32 s84, v16, 28 -; GFX11-NEXT: v_readlane_b32 s83, v16, 27 -; GFX11-NEXT: v_readlane_b32 s82, v16, 26 -; GFX11-NEXT: v_readlane_b32 s81, v16, 25 -; GFX11-NEXT: v_readlane_b32 s80, v16, 24 -; GFX11-NEXT: v_readlane_b32 s71, v16, 23 -; GFX11-NEXT: v_readlane_b32 s70, v16, 22 -; GFX11-NEXT: v_readlane_b32 s68, v16, 20 -; GFX11-NEXT: v_readlane_b32 s67, v16, 19 -; GFX11-NEXT: v_readlane_b32 s66, v16, 18 -; GFX11-NEXT: v_readlane_b32 s65, v16, 17 -; GFX11-NEXT: v_readlane_b32 s64, v16, 16 -; GFX11-NEXT: v_readlane_b32 s55, v16, 15 -; GFX11-NEXT: v_readlane_b32 s54, v16, 14 -; GFX11-NEXT: v_readlane_b32 s53, v16, 13 -; GFX11-NEXT: v_readlane_b32 s52, v16, 12 -; GFX11-NEXT: v_readlane_b32 s51, v16, 11 -; GFX11-NEXT: v_readlane_b32 s50, v16, 10 -; GFX11-NEXT: v_readlane_b32 s49, v16, 9 -; GFX11-NEXT: v_readlane_b32 s48, v16, 8 -; GFX11-NEXT: v_readlane_b32 s39, v16, 7 -; GFX11-NEXT: v_readlane_b32 s38, v16, 6 -; GFX11-NEXT: v_readlane_b32 s37, v16, 5 -; GFX11-NEXT: v_readlane_b32 s36, v16, 4 -; GFX11-NEXT: v_readlane_b32 s35, v16, 3 -; GFX11-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-NEXT: s_and_b32 s1, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s42, 8 +; GFX11-NEXT: s_and_b32 s3, s36, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s6, 8 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: s_and_b32 s3, s5, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s35, 8 +; GFX11-NEXT: s_and_b32 s5, s34, 0xff +; GFX11-NEXT: s_lshl_b32 s6, vcc_hi, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s4 +; GFX11-NEXT: s_or_b32 s4, s5, s6 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 +; GFX11-NEXT: v_dual_mov_b32 
v14, s0 :: v_dual_mov_b32 v15, s1 +; GFX11-NEXT: v_mov_b32_e32 v16, s2 +; GFX11-NEXT: v_readlane_b32 s7, v21, 1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:112 +; GFX11-NEXT: v_readlane_b32 s104, v19, 8 +; GFX11-NEXT: v_readlane_b32 s103, v19, 7 +; GFX11-NEXT: v_readlane_b32 s102, v19, 6 +; GFX11-NEXT: v_readlane_b32 s101, v19, 5 +; GFX11-NEXT: v_readlane_b32 s100, v19, 4 +; GFX11-NEXT: v_readlane_b32 s99, v19, 3 +; GFX11-NEXT: v_readlane_b32 s98, v19, 2 +; GFX11-NEXT: v_readlane_b32 s97, v19, 1 +; GFX11-NEXT: v_readlane_b32 s96, v19, 0 +; GFX11-NEXT: v_readlane_b32 s87, v18, 31 +; GFX11-NEXT: v_readlane_b32 s86, v18, 30 +; GFX11-NEXT: v_readlane_b32 s85, v18, 29 +; GFX11-NEXT: v_readlane_b32 s84, v18, 28 +; GFX11-NEXT: v_readlane_b32 s83, v18, 27 +; GFX11-NEXT: v_readlane_b32 s82, v18, 26 +; GFX11-NEXT: v_readlane_b32 s81, v18, 25 +; GFX11-NEXT: v_readlane_b32 s80, v18, 24 +; GFX11-NEXT: v_readlane_b32 s71, v18, 23 +; GFX11-NEXT: v_readlane_b32 s70, v18, 22 +; GFX11-NEXT: v_readlane_b32 s69, v18, 21 +; GFX11-NEXT: v_readlane_b32 s68, v18, 20 +; GFX11-NEXT: v_readlane_b32 s67, v18, 19 +; GFX11-NEXT: v_readlane_b32 s66, v18, 18 +; GFX11-NEXT: v_readlane_b32 s65, v18, 17 +; GFX11-NEXT: v_readlane_b32 s64, v18, 16 +; GFX11-NEXT: v_readlane_b32 s55, v18, 15 +; GFX11-NEXT: v_readlane_b32 s54, v18, 14 +; GFX11-NEXT: v_readlane_b32 s53, v18, 13 +; GFX11-NEXT: v_readlane_b32 s52, v18, 12 +; GFX11-NEXT: v_readlane_b32 s51, v18, 11 +; GFX11-NEXT: v_readlane_b32 s50, v18, 10 +; GFX11-NEXT: v_readlane_b32 s49, v18, 9 +; GFX11-NEXT: v_readlane_b32 s48, v18, 8 +; GFX11-NEXT: v_readlane_b32 s39, v18, 7 +; GFX11-NEXT: v_readlane_b32 s38, v18, 6 +; GFX11-NEXT: v_readlane_b32 s37, v18, 5 +; GFX11-NEXT: v_readlane_b32 s36, v18, 4 +; GFX11-NEXT: v_readlane_b32 s35, 
v18, 3 +; GFX11-NEXT: v_readlane_b32 s34, v18, 2 +; GFX11-NEXT: v_readlane_b32 s31, v18, 1 +; GFX11-NEXT: v_readlane_b32 s30, v18, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v16, off, s32 -; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v18, off, s32 +; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -91984,10 +92318,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:240 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v95, off, s32 offset:248 @@ -92020,7 +92354,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 
offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:132 ; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:124 @@ -92081,25 +92415,25 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v115 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 8, v116 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v132 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) @@ -92140,253 +92474,262 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v135 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v145 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v129 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff @@ -92414,18 +92757,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_3 @@ -92604,7 +92937,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v119 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v58, v1 @@ -92678,9 +93011,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v145, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v131, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v132, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -92714,13 +93047,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v25, v129, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v133, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v135, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v144, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v128, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v129, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v130, v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -92925,10 +93258,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:232 ; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 @@ -92961,7 +93294,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 ; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:140 ; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 ; 
GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 @@ -93022,25 +93355,25 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v99 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v132 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) @@ -93081,253 +93414,262 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; 
GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v1, 0xff, v148 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v119 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v134 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v135 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v144 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, 
v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v129 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff @@ -93355,18 +93697,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 ; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_3 @@ -93545,7 +93877,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v119 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 @@ -93619,9 +93951,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v145, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v131, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v132, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -93655,13 +93987,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v133, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v135, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v144, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v128, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v129, 
v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v130, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -95315,8 +95647,8 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -95394,8 +95726,8 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -96272,9 +96604,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96290,9 +96622,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; 
VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96308,9 +96640,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96326,9 +96658,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96344,9 +96676,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96362,9 +96694,9 @@ define <16 x i64> 
@bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96380,9 +96712,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96398,9 +96730,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96416,9 +96748,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: 
v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96434,9 +96766,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96452,9 +96784,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96470,9 +96802,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96488,9 +96820,9 @@ define <16 x i64> 
@bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96506,9 +96838,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96524,9 +96856,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96542,10 +96874,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 
-; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96561,9 +96893,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96579,9 +96911,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96597,9 +96929,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 @@ -96615,9 +96947,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96633,9 +96965,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96651,9 +96983,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96669,9 +97001,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: 
v_lshlrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96687,9 +97019,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96705,9 +97037,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96723,9 +97055,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96741,9 +97073,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96759,9 +97091,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96777,9 +97109,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96795,9 +97127,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; 
VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96813,9 +97145,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -96865,9 +97197,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -96880,9 +97212,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 
0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -96895,9 +97227,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -96910,9 +97242,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -96925,9 +97257,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -96940,9 +97272,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 
x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -96955,9 +97287,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -96970,9 +97302,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -96985,9 +97317,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97000,9 +97332,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97015,9 +97347,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97030,9 +97362,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: 
v_add3_u32 v33, v33, v32, s6 @@ -97045,9 +97377,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97060,9 +97392,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97075,9 +97407,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97090,10 +97422,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 
v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97106,9 +97438,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97121,9 +97453,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97136,9 +97468,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 
-; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97151,9 +97483,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97166,9 +97498,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97181,9 +97513,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_add3_u32 v33, 
v33, v32, s6 @@ -97196,9 +97528,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97211,9 +97543,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97226,9 +97558,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97241,9 +97573,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: 
v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97256,9 +97588,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97271,9 +97603,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97286,9 +97618,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 
+; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97301,9 +97633,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -97316,9 +97648,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -99085,8 +99417,8 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -99119,9 +99451,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99137,9 +99469,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99155,9 +99487,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99173,9 +99505,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 
v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99191,9 +99523,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99209,9 +99541,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99227,9 +99559,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99245,9 +99577,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> 
inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99263,9 +99595,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99281,9 +99613,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99299,9 +99631,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99317,9 +99649,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99335,9 +99667,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99353,9 +99685,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99371,9 +99703,9 @@ define inreg <16 
x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99389,9 +99721,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 ; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99407,9 +99739,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99425,9 +99757,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; VI-NEXT: 
v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99443,9 +99775,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99461,9 +99793,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99479,9 +99811,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 
v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99497,9 +99829,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99515,9 +99847,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99533,9 +99865,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99551,9 +99883,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: 
v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99569,9 +99901,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99587,9 +99919,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99605,9 +99937,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 
v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99623,9 +99955,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 ; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99641,9 +99973,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99659,9 +99991,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -99708,8 +100040,8 @@ define inreg 
<16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -100345,864 +100677,1020 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: 
scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, 
s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: 
scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; 
GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: 
v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB63_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB63_3 ; GFX11-NEXT: .LBB63_2: ; %cmp.true -; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s4, s27, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s5, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 -; GFX11-NEXT: 
s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 -; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s4, s26, 16 +; GFX11-NEXT: s_and_b32 s5, s24, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_lshl_b32 s4, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v30, v0, 16, v1 +; GFX11-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 
0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v3 :: v_dual_add_nc_u32 v3, v6, v1 +; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v3, v11 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v13, v12 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 +; 
GFX11-NEXT: v_add_nc_u32_e32 v6, v9, v8 ; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v15, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v12, v10 ; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v15 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v10, v16, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 ; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, 
s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v16 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v9, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v12 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_bfe_u32 v12, v15, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v124, v3, 16, v5 +; GFX11-NEXT: v_lshl_or_b32 v112, v6, 16, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v15 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v12, v11, v12, vcc_lo +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshl_or_b32 v101, v8, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v137, v2, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX11-NEXT: v_lshl_or_b32 v91, v9, 16, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc_lo +; GFX11-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v82, v11, 16, v13 +; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v16 ; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v17 ; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-NEXT: v_lshl_or_b32 v74, v15, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v19 +; GFX11-NEXT: v_lshl_or_b32 v67, v17, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: 
v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v21, v21, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v22, v22, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; 
GFX11-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-NEXT: v_lshl_or_b32 v61, v19, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s3 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v23 +; GFX11-NEXT: v_lshl_or_b32 v56, v21, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v24, 
0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s2 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v25, v25, v24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x7fff, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v26, v26, 
v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s1 ; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_lshl_or_b32 v52, v23, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v27, v27, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, 
s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v28, v28, v27 +; GFX11-NEXT: v_lshl_or_b32 v49, v25, 16, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v29, v29, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v29 +; GFX11-NEXT: v_dual_cndmask_b32 v28, v29, v31 :: v_dual_lshlrev_b32 v29, 16, v176 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo 
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v176 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v47, v27, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v176, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v177 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; 
GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v177 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v177, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v178 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; 
GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v179 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v179 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v179, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v180 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | 
instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v180 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v181 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 
v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v181 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-NEXT: v_lshl_or_b32 v181, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v182 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v182 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 
0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v183 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v183 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 
v34, v34, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-NEXT: v_lshl_or_b32 
v183, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v170 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v170 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, 
v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: 
v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 -; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; 
GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_lshl_or_b32 v170, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v171 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v171 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v171, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v172 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v172 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v172, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v173 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v173 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v173, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v174 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v174 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v174, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v175 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v175 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 
v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v175, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v185 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v185 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v185, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v184 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v184 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v184, v31, 16, v29 ; GFX11-NEXT: .LBB63_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 
v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: 
scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: 
scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 
v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: 
scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: 
v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB63_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 ; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 ; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-NEXT: ; implicit-def: 
$vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 ; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: 
$vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 ; GFX11-NEXT: s_branch .LBB63_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -102828,8 +103316,8 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -102907,8 +103395,8 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -104742,8 +105230,8 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -104886,8 +105374,8 @@ define inreg <16 x i64> 
@bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -104947,107 +105435,109 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, 
v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:296 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:196 +; 
GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:172 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, 
s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, 
v139, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:44 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 ; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 ; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 ; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v171, v5 :: v_dual_mov_b32 v172, v4 +; GFX11-NEXT: v_dual_mov_b32 
v173, v3 :: v_dual_mov_b32 v174, v2 +; GFX11-NEXT: v_dual_mov_b32 v175, v1 :: v_dual_mov_b32 v184, v0 +; GFX11-NEXT: v_dual_mov_b32 v185, s28 :: v_dual_mov_b32 v186, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB67_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v61, s0 :: v_dual_mov_b32 v66, s2 +; GFX11-NEXT: v_dual_mov_b32 v63, s1 :: v_dual_mov_b32 v70, s3 +; GFX11-NEXT: v_dual_mov_b32 v75, s16 :: v_dual_mov_b32 v88, s18 +; GFX11-NEXT: v_dual_mov_b32 v81, s17 :: v_dual_mov_b32 v96, s19 +; GFX11-NEXT: v_dual_mov_b32 v105, s20 :: v_dual_mov_b32 v126, s22 +; GFX11-NEXT: v_dual_mov_b32 v115, s21 :: v_dual_mov_b32 v138, s23 +; GFX11-NEXT: v_dual_mov_b32 v151, s24 :: v_dual_mov_b32 v28, s26 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v44, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB67_3 ; GFX11-NEXT: .LBB67_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v44, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] @@ -105056,142 
+105546,142 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v138, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v126, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v115, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v105, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v96, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v88, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v81, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v75, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v70, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v66, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v63, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB67_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v0, v61 :: v_dual_mov_b32 v1, v63 +; GFX11-NEXT: v_dual_mov_b32 v3, v70 :: v_dual_mov_b32 v4, v75 +; GFX11-NEXT: v_dual_mov_b32 v6, v88 :: v_dual_mov_b32 v9, v115 +; GFX11-NEXT: v_dual_mov_b32 v7, v96 :: v_dual_mov_b32 v8, v105 +; GFX11-NEXT: v_dual_mov_b32 v10, v126 :: v_dual_mov_b32 v15, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v138 :: v_dual_mov_b32 v12, v151 +; GFX11-NEXT: v_dual_mov_b32 v14, v28 :: v_dual_mov_b32 v17, v186 +; GFX11-NEXT: v_dual_mov_b32 v16, v185 :: v_dual_mov_b32 v19, v175 +; GFX11-NEXT: v_dual_mov_b32 v18, v184 :: v_dual_mov_b32 v21, v173 +; GFX11-NEXT: v_dual_mov_b32 v20, v174 :: v_dual_mov_b32 v23, v171 +; GFX11-NEXT: v_dual_mov_b32 v22, v172 :: v_dual_mov_b32 v25, v182 ; 
GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v186, off, s32 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 
offset:8 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; 
GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 
v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 
offset:228 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:252 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:292 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:296 +; GFX11-NEXT: v_dual_mov_b32 v2, v66 :: v_dual_mov_b32 v5, v81 +; GFX11-NEXT: v_dual_mov_b32 v24, v183 :: v_dual_mov_b32 v27, v180 +; GFX11-NEXT: v_dual_mov_b32 v26, v181 :: v_dual_mov_b32 v29, v178 +; GFX11-NEXT: v_dual_mov_b32 v28, v179 :: v_dual_mov_b32 v31, v176 +; GFX11-NEXT: v_mov_b32_e32 v30, v177 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB67_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92 ; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: 
$vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: 
$vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-NEXT: ; implicit-def: 
$vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137 +; GFX11-NEXT: ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158 +; GFX11-NEXT: ; implicit-def: 
$vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170 ; GFX11-NEXT: s_branch .LBB67_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -105802,9 +106292,10 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v18 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v3, s8, v2, 16 +; SI-NEXT: v_mov_b32_e32 v2, s11 ; SI-NEXT: v_mov_b32_e32 v4, s13 ; SI-NEXT: v_mov_b32_e32 v5, s15 ; SI-NEXT: v_mov_b32_e32 v6, s41 @@ -105819,8 +106310,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, s18 ; SI-NEXT: v_mov_b32_e32 v16, s16 ; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v2, s10, v2, 16 ; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 ; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 ; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 @@ -105883,10 +106373,11 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_add_u32 s9, s9, 3 ; SI-NEXT: s_addc_u32 s8, s8, 0 ; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: v_mov_b32_e32 v2, s9 ; SI-NEXT: s_addc_u32 s6, s6, 0 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_alignbit_b32 v3, s8, v2, 16 +; SI-NEXT: v_mov_b32_e32 v2, s11 ; SI-NEXT: v_mov_b32_e32 v4, s13 ; SI-NEXT: v_mov_b32_e32 v5, s15 ; SI-NEXT: v_mov_b32_e32 v6, s41 @@ -105901,8 +106392,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, 
i3 ; SI-NEXT: v_mov_b32_e32 v15, s18 ; SI-NEXT: v_mov_b32_e32 v16, s16 ; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v2, s10, v2, 16 ; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 ; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 ; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 @@ -106086,25 +106576,25 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: s_and_b32 s4, s10, 0xffff ; SI-NEXT: s_lshl_b32 s5, s58, 16 ; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: s_and_b32 s4, s8, 0xffff ; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -106128,37 +106618,37 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB69_4: ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; 
implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v16i64_to_v64i16_scalar: @@ -106185,8 +106675,8 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], 
vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -106264,8 +106754,8 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -107922,8 +108412,8 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -107982,107 +108472,109 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: 
scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:296 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v63, 
s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:172 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: 
scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:136 +; GFX11-NEXT: 
scratch_store_b32 off, v121, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:44 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-NEXT: 
v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 ; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 ; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 ; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v171, v5 :: v_dual_mov_b32 v172, v4 +; GFX11-NEXT: v_dual_mov_b32 v173, v3 :: v_dual_mov_b32 v174, v2 +; GFX11-NEXT: v_dual_mov_b32 v175, v1 :: v_dual_mov_b32 v184, v0 +; GFX11-NEXT: v_dual_mov_b32 v185, s28 :: v_dual_mov_b32 v186, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v61, s0 :: v_dual_mov_b32 v66, s2 +; GFX11-NEXT: v_dual_mov_b32 v63, s1 :: v_dual_mov_b32 v70, s3 +; GFX11-NEXT: v_dual_mov_b32 v75, s16 :: v_dual_mov_b32 v88, s18 +; GFX11-NEXT: v_dual_mov_b32 v81, s17 :: v_dual_mov_b32 v96, s19 +; GFX11-NEXT: v_dual_mov_b32 v105, s20 :: v_dual_mov_b32 v126, s22 +; GFX11-NEXT: v_dual_mov_b32 v115, s21 :: v_dual_mov_b32 v138, s23 +; GFX11-NEXT: v_dual_mov_b32 v151, s24 :: v_dual_mov_b32 v28, s26 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v44, s27 ; 
GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 ; GFX11-NEXT: .LBB71_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v44, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] @@ -108091,142 +108583,142 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v138, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v126, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v115, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v105, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v96, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v88, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v81, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v75, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v70, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v66, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v63, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB71_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v0, v61 :: v_dual_mov_b32 v1, v63 +; GFX11-NEXT: v_dual_mov_b32 v3, v70 :: v_dual_mov_b32 v4, v75 +; GFX11-NEXT: v_dual_mov_b32 v6, v88 :: v_dual_mov_b32 v9, v115 +; GFX11-NEXT: v_dual_mov_b32 v7, v96 :: v_dual_mov_b32 v8, v105 +; GFX11-NEXT: v_dual_mov_b32 v10, v126 :: 
v_dual_mov_b32 v15, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v138 :: v_dual_mov_b32 v12, v151 +; GFX11-NEXT: v_dual_mov_b32 v14, v28 :: v_dual_mov_b32 v17, v186 +; GFX11-NEXT: v_dual_mov_b32 v16, v185 :: v_dual_mov_b32 v19, v175 +; GFX11-NEXT: v_dual_mov_b32 v18, v184 :: v_dual_mov_b32 v21, v173 +; GFX11-NEXT: v_dual_mov_b32 v20, v174 :: v_dual_mov_b32 v23, v171 +; GFX11-NEXT: v_dual_mov_b32 v22, v172 :: v_dual_mov_b32 v25, v182 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; 
GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v186, off, s32 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:116 +; 
GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 
0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:196 +; 
GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:252 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:292 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:296 +; GFX11-NEXT: v_dual_mov_b32 v2, v66 :: v_dual_mov_b32 v5, v81 +; GFX11-NEXT: v_dual_mov_b32 v24, v183 :: v_dual_mov_b32 v27, v180 +; GFX11-NEXT: v_dual_mov_b32 v26, v181 :: v_dual_mov_b32 v29, v178 +; GFX11-NEXT: v_dual_mov_b32 v28, v179 :: v_dual_mov_b32 v31, v176 +; GFX11-NEXT: v_mov_b32_e32 v30, v177 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB71_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: 
$vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92 ; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: 
$vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60 +; GFX11-NEXT: ; implicit-def: 
$vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137 +; GFX11-NEXT: ; implicit-def: 
$vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158 +; GFX11-NEXT: ; implicit-def: $vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170 ; GFX11-NEXT: s_branch .LBB71_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -111911,32 +112403,30 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v60, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 @@ -112031,26 +112521,28 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 
v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 @@ -112132,9 +112624,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-FAKE16-NEXT: .LBB72_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -112252,48 +112743,48 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB72_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v67, v65 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v39, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v65, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, 
v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v56 @@ -112555,27 +113046,26 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -112733,19 +113223,19 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill @@ -112811,19 +113301,19 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; SI-NEXT: v_alignbit_b32 v38, v28, v27, 24 @@ -112974,19 +113464,19 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 ; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill @@ -113052,22 +113542,22 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; SI-NEXT: v_alignbit_b32 v38, v28, v27, 24 @@ -113181,11 +113671,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_load_dword v28, 
off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 @@ -113209,12 +113699,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 @@ -113315,11 +113805,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -113348,14 +113838,14 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -116366,383 +116856,380 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:92 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: 
v_writelane_b32 v76, s30, 0 -; GFX11-NEXT: v_writelane_b32 v77, s96, 0 +; GFX11-NEXT: v_writelane_b32 v74, s30, 0 +; GFX11-NEXT: v_writelane_b32 v75, s96, 0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 ; GFX11-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11-NEXT: v_readfirstlane_b32 s5, v2 -; GFX11-NEXT: v_writelane_b32 v76, s31, 1 -; GFX11-NEXT: v_writelane_b32 v77, s97, 1 +; GFX11-NEXT: v_writelane_b32 v74, s31, 1 +; GFX11-NEXT: v_writelane_b32 v75, s97, 1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v3 ; GFX11-NEXT: v_readfirstlane_b32 s7, v4 ; GFX11-NEXT: v_readfirstlane_b32 s8, v5 -; GFX11-NEXT: v_writelane_b32 v76, s34, 2 -; GFX11-NEXT: v_writelane_b32 v77, s98, 2 +; GFX11-NEXT: v_writelane_b32 v74, s34, 2 +; GFX11-NEXT: v_writelane_b32 v75, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s9, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v76, s35, 3 -; GFX11-NEXT: v_writelane_b32 v77, s99, 3 +; GFX11-NEXT: v_writelane_b32 v74, s35, 3 +; GFX11-NEXT: v_writelane_b32 v75, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s12, v9 ; GFX11-NEXT: v_readfirstlane_b32 s13, v10 ; GFX11-NEXT: v_readfirstlane_b32 s14, v11 -; GFX11-NEXT: v_writelane_b32 v76, s36, 4 -; GFX11-NEXT: v_writelane_b32 v77, s100, 4 +; GFX11-NEXT: v_writelane_b32 v74, s36, 4 +; GFX11-NEXT: v_writelane_b32 v75, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s15, v12 ; GFX11-NEXT: v_readfirstlane_b32 s40, v13 ; GFX11-NEXT: v_readfirstlane_b32 s41, v14 -; GFX11-NEXT: v_writelane_b32 v76, s37, 5 -; GFX11-NEXT: v_writelane_b32 v77, s101, 5 -; GFX11-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-NEXT: v_writelane_b32 v74, s37, 5 +; GFX11-NEXT: v_writelane_b32 v75, s101, 5 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x13 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 -; 
GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 -; GFX11-NEXT: v_writelane_b32 v76, s38, 6 -; GFX11-NEXT: v_writelane_b32 v77, s102, 6 -; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr79 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v76, s39, 7 -; GFX11-NEXT: v_writelane_b32 v77, s103, 7 -; GFX11-NEXT: v_writelane_b32 v76, s48, 8 -; GFX11-NEXT: v_writelane_b32 v77, s104, 8 -; GFX11-NEXT: v_writelane_b32 v76, s49, 9 -; GFX11-NEXT: v_writelane_b32 v76, s50, 10 -; GFX11-NEXT: v_writelane_b32 v76, s51, 11 -; GFX11-NEXT: v_writelane_b32 v76, s52, 12 -; GFX11-NEXT: v_writelane_b32 v76, s53, 13 -; GFX11-NEXT: v_writelane_b32 v76, s54, 14 -; GFX11-NEXT: v_writelane_b32 v76, s55, 15 -; GFX11-NEXT: v_writelane_b32 v76, s64, 16 -; GFX11-NEXT: v_writelane_b32 v76, s65, 17 -; GFX11-NEXT: v_writelane_b32 v76, s66, 18 -; GFX11-NEXT: v_writelane_b32 v76, s67, 19 -; GFX11-NEXT: v_writelane_b32 v76, s68, 20 -; GFX11-NEXT: v_writelane_b32 v76, s69, 21 -; GFX11-NEXT: v_writelane_b32 v76, s70, 22 -; GFX11-NEXT: v_writelane_b32 v76, s71, 23 -; GFX11-NEXT: v_writelane_b32 v76, s80, 24 -; 
GFX11-NEXT: v_writelane_b32 v76, s81, 25 -; GFX11-NEXT: v_writelane_b32 v76, s82, 26 -; GFX11-NEXT: v_writelane_b32 v76, s83, 27 -; GFX11-NEXT: v_writelane_b32 v76, s84, 28 -; GFX11-NEXT: v_writelane_b32 v76, s85, 29 -; GFX11-NEXT: v_writelane_b32 v76, s86, 30 -; GFX11-NEXT: v_writelane_b32 v76, s87, 31 +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 +; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v74, s38, 6 +; GFX11-NEXT: v_writelane_b32 v75, s102, 6 +; GFX11-NEXT: v_writelane_b32 v74, s39, 7 +; GFX11-NEXT: v_writelane_b32 v75, s103, 7 +; GFX11-NEXT: s_mov_b32 s103, 0 +; GFX11-NEXT: v_writelane_b32 v74, s48, 8 +; GFX11-NEXT: v_writelane_b32 v75, s104, 8 +; GFX11-NEXT: v_writelane_b32 v74, s49, 9 +; GFX11-NEXT: v_writelane_b32 v74, s50, 10 +; GFX11-NEXT: v_writelane_b32 v74, s51, 11 +; GFX11-NEXT: v_writelane_b32 v74, s52, 12 +; GFX11-NEXT: v_writelane_b32 v74, s53, 13 +; 
GFX11-NEXT: v_writelane_b32 v74, s54, 14 +; GFX11-NEXT: v_writelane_b32 v74, s55, 15 +; GFX11-NEXT: v_writelane_b32 v74, s64, 16 +; GFX11-NEXT: v_writelane_b32 v74, s65, 17 +; GFX11-NEXT: v_writelane_b32 v74, s66, 18 +; GFX11-NEXT: v_writelane_b32 v74, s67, 19 +; GFX11-NEXT: v_writelane_b32 v74, s68, 20 +; GFX11-NEXT: v_writelane_b32 v74, s69, 21 +; GFX11-NEXT: v_writelane_b32 v74, s70, 22 +; GFX11-NEXT: v_writelane_b32 v74, s71, 23 +; GFX11-NEXT: v_writelane_b32 v74, s80, 24 +; GFX11-NEXT: v_writelane_b32 v74, s81, 25 +; GFX11-NEXT: v_writelane_b32 v74, s82, 26 +; GFX11-NEXT: v_writelane_b32 v74, s83, 27 +; GFX11-NEXT: v_writelane_b32 v74, s84, 28 +; GFX11-NEXT: v_writelane_b32 v74, s85, 29 +; GFX11-NEXT: v_writelane_b32 v74, s86, 30 +; GFX11-NEXT: v_writelane_b32 v74, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-NEXT: s_lshr_b32 s48, s41, 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 0 ; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s50, s41, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s13, 8 -; GFX11-NEXT: s_lshr_b32 s49, s41, 16 -; GFX11-NEXT: s_lshr_b32 s48, s41, 8 -; GFX11-NEXT: s_lshr_b32 s52, s40, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-NEXT: s_lshr_b32 s39, s41, 16 +; GFX11-NEXT: s_lshr_b32 s38, s41, 8 +; GFX11-NEXT: s_lshr_b32 s50, s40, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s11, 24 -; GFX11-NEXT: s_lshr_b32 s51, s40, 8 -; GFX11-NEXT: s_lshr_b32 s39, s15, 24 -; GFX11-NEXT: s_lshr_b32 s38, s15, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-NEXT: s_lshr_b32 s49, s40, 8 +; GFX11-NEXT: s_lshr_b32 s37, s15, 24 +; GFX11-NEXT: s_lshr_b32 s36, s15, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s11, 16 -; GFX11-NEXT: s_lshr_b32 s37, s15, 8 -; GFX11-NEXT: s_lshr_b32 s54, s14, 16 -; GFX11-NEXT: s_lshr_b32 s53, s14, 8 -; GFX11-NEXT: v_writelane_b32 
v78, s42, 3 +; GFX11-NEXT: s_lshr_b32 s35, s15, 8 +; GFX11-NEXT: s_lshr_b32 s52, s14, 16 +; GFX11-NEXT: s_lshr_b32 s51, s14, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s11, 8 -; GFX11-NEXT: s_lshr_b32 s36, s13, 24 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: s_lshr_b32 s55, s12, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-NEXT: s_lshr_b32 s104, s13, 8 +; GFX11-NEXT: s_lshr_b32 s54, s12, 16 +; GFX11-NEXT: s_lshr_b32 s53, s12, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s9, 24 -; GFX11-NEXT: s_lshr_b32 s66, s10, 16 -; GFX11-NEXT: s_lshr_b32 s65, s10, 8 -; GFX11-NEXT: s_lshr_b32 s68, s8, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-NEXT: s_lshr_b32 s64, s10, 16 +; GFX11-NEXT: s_lshr_b32 s55, s10, 8 +; GFX11-NEXT: s_lshr_b32 s66, s8, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s67, s8, 8 -; GFX11-NEXT: s_lshr_b32 s70, s6, 16 -; GFX11-NEXT: s_lshr_b32 s69, s6, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-NEXT: s_lshr_b32 s68, s6, 16 +; GFX11-NEXT: s_lshr_b32 s67, s6, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s9, 8 -; GFX11-NEXT: s_lshr_b32 s80, s4, 16 -; GFX11-NEXT: s_lshr_b32 s71, s4, 8 -; GFX11-NEXT: s_lshr_b32 s82, s28, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-NEXT: s_lshr_b32 s70, s4, 16 +; GFX11-NEXT: s_lshr_b32 s69, s4, 8 +; GFX11-NEXT: s_lshr_b32 s80, s28, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s7, 24 -; GFX11-NEXT: s_lshr_b32 s81, s28, 8 -; GFX11-NEXT: s_lshr_b32 s84, s26, 16 -; GFX11-NEXT: s_lshr_b32 s83, s26, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-NEXT: s_lshr_b32 s71, s28, 8 +; GFX11-NEXT: s_lshr_b32 s82, s26, 16 +; GFX11-NEXT: s_lshr_b32 s81, s26, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s7, 16 -; GFX11-NEXT: s_lshr_b32 s86, 
s24, 16 -; GFX11-NEXT: s_lshr_b32 s85, s24, 8 -; GFX11-NEXT: s_lshr_b32 s96, s22, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-NEXT: s_lshr_b32 s84, s24, 16 +; GFX11-NEXT: s_lshr_b32 s83, s24, 8 +; GFX11-NEXT: s_lshr_b32 s86, s22, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 9 ; GFX11-NEXT: s_lshr_b32 s42, s7, 8 -; GFX11-NEXT: s_lshr_b32 s87, s22, 8 -; GFX11-NEXT: s_lshr_b32 s98, s20, 16 -; GFX11-NEXT: s_lshr_b32 s97, s20, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-NEXT: s_lshr_b32 s85, s22, 8 +; GFX11-NEXT: s_lshr_b32 s96, s20, 16 +; GFX11-NEXT: s_lshr_b32 s87, s20, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 10 ; GFX11-NEXT: s_lshr_b32 s42, s5, 24 -; GFX11-NEXT: s_lshr_b32 s100, s18, 16 -; GFX11-NEXT: s_lshr_b32 s99, s18, 8 -; GFX11-NEXT: s_lshr_b32 s102, s16, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-NEXT: s_lshr_b32 s98, s18, 16 +; GFX11-NEXT: s_lshr_b32 s97, s18, 8 +; GFX11-NEXT: s_lshr_b32 s100, s16, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 11 ; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s101, s16, 8 -; GFX11-NEXT: s_lshr_b32 s104, s2, 16 -; GFX11-NEXT: s_lshr_b32 s103, s2, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-NEXT: s_lshr_b32 s99, s16, 8 +; GFX11-NEXT: s_lshr_b32 s102, s2, 16 +; GFX11-NEXT: s_lshr_b32 s101, s2, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 12 ; GFX11-NEXT: s_lshr_b32 s42, s5, 8 -; GFX11-NEXT: s_lshr_b32 s35, s0, 16 -; GFX11-NEXT: s_lshr_b32 s34, s0, 8 +; GFX11-NEXT: s_lshr_b32 s34, s0, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-NEXT: v_writelane_b32 v76, s42, 13 ; GFX11-NEXT: s_lshr_b32 s42, s29, 24 ; GFX11-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[10:11], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-NEXT: v_writelane_b32 v76, s42, 14 ; GFX11-NEXT: s_lshr_b32 s42, s29, 16 ; 
GFX11-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-NEXT: v_writelane_b32 v76, s42, 15 ; GFX11-NEXT: s_lshr_b32 s42, s29, 8 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 16 ; GFX11-NEXT: s_lshr_b32 s42, s27, 24 ; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-NEXT: v_writelane_b32 v76, s42, 17 ; GFX11-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 ; GFX11-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-NEXT: v_writelane_b32 v76, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s27, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-NEXT: v_writelane_b32 v76, s42, 19 ; GFX11-NEXT: s_lshr_b32 s42, s25, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-NEXT: v_writelane_b32 v76, s42, 20 ; GFX11-NEXT: s_lshr_b32 s42, s25, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-NEXT: v_writelane_b32 v76, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-NEXT: v_writelane_b32 v76, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-NEXT: v_writelane_b32 v76, s42, 23 ; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-NEXT: v_writelane_b32 
v76, s42, 24 ; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-NEXT: v_writelane_b32 v76, s42, 25 ; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-NEXT: v_writelane_b32 v76, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-NEXT: v_writelane_b32 v76, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s21, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-NEXT: v_writelane_b32 v76, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s19, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-NEXT: v_writelane_b32 v76, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-NEXT: v_writelane_b32 v76, s42, 30 ; GFX11-NEXT: s_lshr_b32 s42, s19, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-NEXT: v_writelane_b32 v76, s42, 31 ; GFX11-NEXT: s_lshr_b32 s42, s17, 24 -; GFX11-NEXT: v_writelane_b32 v79, s42, 0 +; GFX11-NEXT: v_writelane_b32 v77, s42, 0 ; GFX11-NEXT: s_lshr_b32 s42, s17, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v79, s42, 1 +; GFX11-NEXT: v_writelane_b32 v77, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s17, 8 -; GFX11-NEXT: v_writelane_b32 v79, s42, 2 +; GFX11-NEXT: v_writelane_b32 v77, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s3, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v79, s42, 3 +; GFX11-NEXT: v_writelane_b32 v77, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: v_writelane_b32 
v79, s42, 4 +; GFX11-NEXT: v_writelane_b32 v77, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s3, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v79, s42, 5 +; GFX11-NEXT: v_writelane_b32 v77, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s1, 24 -; GFX11-NEXT: v_writelane_b32 v79, s42, 6 +; GFX11-NEXT: v_writelane_b32 v77, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v79, s42, 7 +; GFX11-NEXT: v_writelane_b32 v77, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s1, 8 -; GFX11-NEXT: v_writelane_b32 v79, s42, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 8 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s103 ; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 ; GFX11-NEXT: .LBB73_2: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[28:29], s[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[32:33], s[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[5:6], s[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[52:53], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[17:18], s[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[29:30], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[50:51], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[33:34], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[64:65], s[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[1:2], s[40:41], 1.0 ; GFX11-NEXT: v_add_f64 v[3:4], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[5:6], s[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[9:10], s[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[11:12], s[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[13:14], s[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[15:16], s[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[19:20], s[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[48:49], 
s[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[64:65], s[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[23:24] -; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[28:29] -; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[32:33] -; GFX11-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] -; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] -; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; GFX11-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] -; GFX11-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] -; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-NEXT: v_add_f64 v[25:26], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[37:38], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[66:67], s[0:1], 1.0 +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[33:34] ; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[64:65] -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, 
v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v8 +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[37:38] +; GFX11-NEXT: v_lshrrev_b64 v[82:83], 24, v[66:67] +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v8 ; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v8 ; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, 
v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v14 ; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 24, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 24, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 24, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, 
v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 24, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 16, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v65 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v65 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v65 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v64 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v64 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 24, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 24, v38 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v45, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 24, v51 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 16, v51 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v50 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 8, v50 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v65 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v65 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v65 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v64 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v64 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v67 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v67 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v67 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v66 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v66 ; GFX11-NEXT: s_branch .LBB73_5 ; GFX11-NEXT: .LBB73_3: ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 +; GFX11-NEXT: s_mov_b32 s103, -1 ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 +; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr35 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr101 ; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr99 ; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: ; implicit-def: $sgpr97 ; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr56 ; GFX11-NEXT: ; implicit-def: $sgpr87 ; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr58 ; 
GFX11-NEXT: ; implicit-def: $sgpr85 ; GFX11-NEXT: ; implicit-def: $sgpr86 +; GFX11-NEXT: ; implicit-def: $sgpr60 ; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr84 ; GFX11-NEXT: ; implicit-def: $sgpr81 @@ -116757,17 +117244,19 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: ; implicit-def: $sgpr66 ; GFX11-NEXT: ; implicit-def: $sgpr55 ; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: $sgpr36 ; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr54 -; GFX11-NEXT: ; implicit-def: $sgpr37 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr104 ; GFX11-NEXT: ; implicit-def: $sgpr51 ; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr37 ; GFX11-NEXT: ; implicit-def: $sgpr49 ; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr48 ; GFX11-NEXT: ; implicit-def: $sgpr30 ; GFX11-NEXT: ; implicit-def: $sgpr94 ; GFX11-NEXT: ; implicit-def: $sgpr92 @@ -116858,524 +117347,519 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: ; kill: killed $sgpr43 ; GFX11-NEXT: s_branch .LBB73_2 ; GFX11-NEXT: .LBB73_4: -; GFX11-NEXT: v_dual_mov_b32 v64, s0 :: v_dual_mov_b32 v65, s1 -; GFX11-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-NEXT: v_dual_mov_b32 v66, s0 :: v_dual_mov_b32 v67, s1 +; GFX11-NEXT: v_readlane_b32 s0, v76, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, s40 :: v_dual_mov_b32 v2, s41 ; GFX11-NEXT: v_dual_mov_b32 v3, s14 :: v_dual_mov_b32 v4, s15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v87, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 1 -; GFX11-NEXT: v_mov_b32_e32 
v39, s54 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v84, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 1 ; GFX11-NEXT: v_dual_mov_b32 v5, s12 :: v_dual_mov_b32 v6, s13 ; GFX11-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 -; GFX11-NEXT: v_mov_b32_e32 v96, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-NEXT: v_mov_b32_e32 v85, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 2 ; GFX11-NEXT: v_dual_mov_b32 v9, s8 :: v_dual_mov_b32 v10, s9 ; GFX11-NEXT: v_dual_mov_b32 v11, s6 :: v_dual_mov_b32 v12, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v99, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 3 -; GFX11-NEXT: v_mov_b32_e32 v55, s53 -; GFX11-NEXT: v_dual_mov_b32 v13, s4 :: v_dual_mov_b32 v14, s5 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 -; GFX11-NEXT: v_mov_b32_e32 v100, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 4 -; GFX11-NEXT: v_dual_mov_b32 v19, s26 :: v_dual_mov_b32 v20, s27 -; GFX11-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v98, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 3 +; GFX11-NEXT: v_dual_mov_b32 v86, s104 :: v_dual_mov_b32 v13, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v17, s28 +; GFX11-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v99, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 +; GFX11-NEXT: v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v26, s25 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v101, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 5 -; GFX11-NEXT: v_dual_mov_b32 v27, s51 :: v_dual_mov_b32 v28, s22 -; GFX11-NEXT: v_dual_mov_b32 v29, s23 :: v_dual_mov_b32 v32, s20 -; GFX11-NEXT: v_dual_mov_b32 v33, s21 
:: v_dual_mov_b32 v112, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 6 -; GFX11-NEXT: v_dual_mov_b32 v31, s49 :: v_dual_mov_b32 v36, s18 -; GFX11-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v48, s16 +; GFX11-NEXT: v_readlane_b32 s0, v76, 5 +; GFX11-NEXT: v_dual_mov_b32 v29, s22 :: v_dual_mov_b32 v30, s23 +; GFX11-NEXT: v_dual_mov_b32 v33, s20 :: v_dual_mov_b32 v34, s21 +; GFX11-NEXT: v_mov_b32_e32 v112, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 6 +; GFX11-NEXT: v_dual_mov_b32 v37, s18 :: v_dual_mov_b32 v38, s19 +; GFX11-NEXT: v_dual_mov_b32 v50, s16 :: v_dual_mov_b32 v51, s17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v113, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-NEXT: v_dual_mov_b32 v64, s2 :: v_dual_mov_b32 v65, s3 +; GFX11-NEXT: v_dual_mov_b32 v47, s34 :: v_dual_mov_b32 v56, vcc_hi +; GFX11-NEXT: v_mov_b32_e32 v114, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-NEXT: v_dual_mov_b32 v42, s102 :: v_dual_mov_b32 v43, s101 +; GFX11-NEXT: v_dual_mov_b32 v181, s100 :: v_dual_mov_b32 v40, s99 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v117, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 9 +; GFX11-NEXT: v_dual_mov_b32 v176, s98 :: v_dual_mov_b32 v179, s97 +; GFX11-NEXT: v_dual_mov_b32 v166, s96 :: v_dual_mov_b32 v167, s87 +; GFX11-NEXT: v_mov_b32_e32 v118, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 10 +; GFX11-NEXT: v_dual_mov_b32 v161, s86 :: v_dual_mov_b32 v162, s85 +; GFX11-NEXT: v_dual_mov_b32 v148, s84 :: v_dual_mov_b32 v149, s83 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v114, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 7 -; GFX11-NEXT: v_dual_mov_b32 v35, s48 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v44, s35 -; GFX11-NEXT: v_dual_mov_b32 v41, s104 :: v_dual_mov_b32 
v116, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 8 -; GFX11-NEXT: v_dual_mov_b32 v46, s34 :: v_dual_mov_b32 v43, s103 -; GFX11-NEXT: v_dual_mov_b32 v181, s102 :: v_dual_mov_b32 v182, s101 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v119, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 9 -; GFX11-NEXT: v_dual_mov_b32 v51, s39 :: v_dual_mov_b32 v176, s100 -; GFX11-NEXT: v_mov_b32_e32 v177, s99 -; GFX11-NEXT: v_dual_mov_b32 v163, s98 :: v_dual_mov_b32 v160, s96 -; GFX11-NEXT: v_mov_b32_e32 v128, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 10 -; GFX11-NEXT: v_dual_mov_b32 v165, s97 :: v_dual_mov_b32 v148, s86 -; GFX11-NEXT: v_dual_mov_b32 v161, s87 :: v_dual_mov_b32 v144, s83 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v129, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 11 -; GFX11-NEXT: v_mov_b32_e32 v71, s38 -; GFX11-NEXT: v_dual_mov_b32 v149, s85 :: v_dual_mov_b32 v130, s82 -; GFX11-NEXT: v_dual_mov_b32 v135, s84 :: v_dual_mov_b32 v118, s71 -; GFX11-NEXT: v_mov_b32_e32 v132, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 12 -; GFX11-NEXT: v_dual_mov_b32 v131, s81 :: v_dual_mov_b32 v102, s68 -; GFX11-NEXT: v_dual_mov_b32 v117, s80 :: v_dual_mov_b32 v98, s65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v133, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 13 -; GFX11-NEXT: v_mov_b32_e32 v83, s37 -; GFX11-NEXT: v_dual_mov_b32 v113, s70 :: v_dual_mov_b32 v84, s64 -; GFX11-NEXT: v_dual_mov_b32 v115, s69 :: v_dual_mov_b32 v86, s55 +; GFX11-NEXT: v_readlane_b32 s0, v76, 11 +; GFX11-NEXT: v_dual_mov_b32 v135, s82 :: v_dual_mov_b32 v132, s80 +; GFX11-NEXT: v_dual_mov_b32 v147, s81 :: v_dual_mov_b32 v128, s70 +; GFX11-NEXT: v_mov_b32_e32 v130, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 12 +; GFX11-NEXT: v_dual_mov_b32 v133, s71 :: v_dual_mov_b32 v116, s67 +; GFX11-NEXT: v_dual_mov_b32 
v129, s69 :: v_dual_mov_b32 v102, s66 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v131, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 13 +; GFX11-NEXT: v_dual_mov_b32 v115, s68 :: v_dual_mov_b32 v100, s55 +; GFX11-NEXT: v_dual_mov_b32 v103, s65 :: v_dual_mov_b32 v96, s53 ; GFX11-NEXT: v_mov_b32_e32 v134, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 14 -; GFX11-NEXT: v_dual_mov_b32 v103, s67 :: v_dual_mov_b32 v18, s52 -; GFX11-NEXT: v_dual_mov_b32 v97, s66 :: v_dual_mov_b32 v22, s50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readlane_b32 s0, v76, 14 +; GFX11-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v28, s50 +; GFX11-NEXT: v_dual_mov_b32 v87, s54 :: v_dual_mov_b32 v32, s49 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v144, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 15 +; GFX11-NEXT: v_dual_mov_b32 v49, s52 :: v_dual_mov_b32 v16, s48 +; GFX11-NEXT: v_dual_mov_b32 v71, s51 :: v_dual_mov_b32 v20, s39 ; GFX11-NEXT: v_mov_b32_e32 v145, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 15 -; GFX11-NEXT: v_mov_b32_e32 v85, s36 -; GFX11-NEXT: v_dual_mov_b32 v81, s42 :: v_dual_mov_b32 v38, s90 -; GFX11-NEXT: v_dual_mov_b32 v69, s56 :: v_dual_mov_b32 v34, s88 +; GFX11-NEXT: v_readlane_b32 s0, v76, 16 +; GFX11-NEXT: v_dual_mov_b32 v24, s38 :: v_dual_mov_b32 v39, s36 +; GFX11-NEXT: v_dual_mov_b32 v36, s37 :: v_dual_mov_b32 v55, s35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v146, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 16 -; GFX11-NEXT: v_dual_mov_b32 v67, s60 :: v_dual_mov_b32 v30, s78 -; GFX11-NEXT: v_dual_mov_b32 v26, s76 :: v_dual_mov_b32 v25, s74 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v147, s0 -; GFX11-NEXT: v_readlane_b32 s0, 
v78, 17 -; GFX11-NEXT: v_mov_b32_e32 v21, s72 -; GFX11-NEXT: v_dual_mov_b32 v17, s62 :: v_dual_mov_b32 v80, s44 -; GFX11-NEXT: v_mov_b32_e32 v70, s46 +; GFX11-NEXT: v_readlane_b32 s0, v76, 17 +; GFX11-NEXT: v_dual_mov_b32 v82, s42 :: v_dual_mov_b32 v53, s94 +; GFX11-NEXT: v_dual_mov_b32 v80, s46 :: v_dual_mov_b32 v35, s88 ; GFX11-NEXT: v_mov_b32_e32 v150, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 18 -; GFX11-NEXT: v_mov_b32_e32 v68, s58 -; GFX11-NEXT: v_mov_b32_e32 v66, s30 -; GFX11-NEXT: v_mov_b32_e32 v54, s94 +; GFX11-NEXT: v_readlane_b32 s0, v76, 18 +; GFX11-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v31, s78 +; GFX11-NEXT: v_dual_mov_b32 v68, s60 :: v_dual_mov_b32 v27, s76 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v151, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 19 +; GFX11-NEXT: v_dual_mov_b32 v54, s30 :: v_dual_mov_b32 v23, s74 +; GFX11-NEXT: v_dual_mov_b32 v52, s92 :: v_dual_mov_b32 v19, s72 +; GFX11-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 20 +; GFX11-NEXT: v_dual_mov_b32 v48, s90 :: v_dual_mov_b32 v15, s62 +; GFX11-NEXT: v_mov_b32_e32 v81, s44 +; GFX11-NEXT: v_mov_b32_e32 v69, s58 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v50, s92 :: v_dual_mov_b32 v151, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 19 -; GFX11-NEXT: v_mov_b32_e32 v162, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v163, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 21 ; GFX11-NEXT: v_mov_b32_e32 v164, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 21 -; GFX11-NEXT: v_mov_b32_e32 v166, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-NEXT: v_readlane_b32 s0, v76, 22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v167, s0 -; GFX11-NEXT: 
v_readlane_b32 s0, v78, 23 -; GFX11-NEXT: v_mov_b32_e32 v178, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 23 +; GFX11-NEXT: v_mov_b32_e32 v177, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v179, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 25 ; GFX11-NEXT: v_mov_b32_e32 v180, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-NEXT: v_readlane_b32 s0, v76, 26 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v182, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 27 ; GFX11-NEXT: v_mov_b32_e32 v183, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 27 -; GFX11-NEXT: v_mov_b32_e32 v40, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-NEXT: v_readlane_b32 s0, v76, 28 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v42, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 29 -; GFX11-NEXT: v_mov_b32_e32 v45, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-NEXT: v_mov_b32_e32 v41, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 29 +; GFX11-NEXT: v_mov_b32_e32 v44, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 30 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v47, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 31 -; GFX11-NEXT: v_mov_b32_e32 v56, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 0 +; GFX11-NEXT: v_mov_b32_e32 v45, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 31 +; GFX11-NEXT: v_mov_b32_e32 v46, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v57, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 1 +; GFX11-NEXT: v_readlane_b32 
s0, v77, 1 ; GFX11-NEXT: v_mov_b32_e32 v58, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 2 +; GFX11-NEXT: v_readlane_b32 s0, v77, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v59, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 3 +; GFX11-NEXT: v_readlane_b32 s0, v77, 3 ; GFX11-NEXT: v_mov_b32_e32 v60, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 4 +; GFX11-NEXT: v_readlane_b32 s0, v77, 4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v61, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 5 +; GFX11-NEXT: v_readlane_b32 s0, v77, 5 ; GFX11-NEXT: v_mov_b32_e32 v62, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 6 +; GFX11-NEXT: v_readlane_b32 s0, v77, 6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v63, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 7 +; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v72, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 8 +; GFX11-NEXT: v_readlane_b32 s0, v77, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v73, s0 ; GFX11-NEXT: .LBB73_5: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v46 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v64 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v56 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v66 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v82 +; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-NEXT: v_and_b32_e32 v56, 0xff, v64 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v81 -; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v65 +; GFX11-NEXT: v_or_b32_e32 v66, v66, v83 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v47 +; GFX11-NEXT: v_lshlrev_b32_e32 v47, 8, v63 +; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v181, 0xff, v181 +; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v66 +; GFX11-NEXT: v_or_b32_e32 v82, v83, v82 +; GFX11-NEXT: v_lshlrev_b32_e32 
v83, 8, v73 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v80 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GFX11-NEXT: v_or_b32_e32 v64, v64, v82 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v44 -; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v63 -; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v43 -; GFX11-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v64 +; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v82 +; GFX11-NEXT: v_or_b32_e32 v67, v67, v83 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v72 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v43 +; GFX11-NEXT: v_and_b32_e32 v43, 0xff, v61 +; GFX11-NEXT: v_or_b32_e32 v64, v64, v66 +; GFX11-NEXT: v_and_b32_e32 v67, 0xffff, v67 +; GFX11-NEXT: v_or_b32_e32 v83, v83, v47 +; GFX11-NEXT: v_or_b32_e32 v66, v56, v82 +; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v42 +; GFX11-NEXT: v_lshlrev_b32_e32 v42, 8, v62 +; GFX11-NEXT: v_lshlrev_b32_e32 v47, 8, v60 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v83 +; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v66 ; GFX11-NEXT: v_or_b32_e32 v81, v82, v81 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v73 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v43 -; GFX11-NEXT: v_or_b32_e32 v80, v41, v80 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v81 -; GFX11-NEXT: v_or_b32_e32 v65, v65, v82 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v72 -; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v60 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_or_b32_e32 v72, v64, v81 -; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v65 -; GFX11-NEXT: v_or_b32_e32 v82, v82, v44 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v62 -; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v65 +; GFX11-NEXT: v_or_b32_e32 v80, v181, v80 +; GFX11-NEXT: v_or_b32_e32 v65, v67, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v40 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v81 +; GFX11-NEXT: v_or_b32_e32 v81, v82, v42 +; 
GFX11-NEXT: v_or_b32_e32 v82, v43, v47 +; GFX11-NEXT: v_lshlrev_b32_e32 v181, 8, v57 +; GFX11-NEXT: v_or_b32_e32 v50, v50, v83 +; GFX11-NEXT: v_or_b32_e32 v66, v66, v67 +; GFX11-NEXT: v_and_b32_e32 v67, 0xffff, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v82 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v59 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v58 +; GFX11-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v80 +; GFX11-NEXT: v_or_b32_e32 v67, v67, v81 +; GFX11-NEXT: v_or_b32_e32 v51, v51, v82 +; GFX11-NEXT: v_or_b32_e32 v81, v83, v181 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v179 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v176 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v82 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v61 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v81 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v181 -; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-NEXT: v_or_b32_e32 v73, v64, v65 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v80 -; GFX11-NEXT: v_or_b32_e32 v65, v82, v41 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v182 +; GFX11-NEXT: v_or_b32_e32 v80, v50, v80 +; GFX11-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v81 +; GFX11-NEXT: v_or_b32_e32 v37, v37, v82 +; GFX11-NEXT: v_or_b32_e32 v70, v83, v70 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v46 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v45 +; GFX11-NEXT: v_lshlrev_b32_e32 v176, 8, v44 +; GFX11-NEXT: v_or_b32_e32 v81, v50, v51 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 -; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GFX11-NEXT: v_or_b32_e32 v74, v52, v64 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v65 -; GFX11-NEXT: v_or_b32_e32 v48, v48, v80 -; GFX11-NEXT: v_or_b32_e32 v64, v81, v70 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v59 -; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v58 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v80, 8, v57 -; GFX11-NEXT: v_or_b32_e32 v75, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v64 -; GFX11-NEXT: v_or_b32_e32 v49, v49, v65 -; GFX11-NEXT: v_or_b32_e32 v53, v70, v80 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v177 -; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v176 -; GFX11-NEXT: v_or_b32_e32 v43, v48, v52 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; GFX11-NEXT: v_or_b32_e32 v36, v36, v64 -; GFX11-NEXT: v_or_b32_e32 v52, v65, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v47 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v45 -; GFX11-NEXT: v_or_b32_e32 v44, v48, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v52 -; GFX11-NEXT: v_or_b32_e32 v37, v37, v53 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-NEXT: v_or_b32_e32 v49, v64, v65 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v165 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v163 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v68 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v42 +; GFX11-NEXT: v_or_b32_e32 v38, v38, v82 +; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v166 +; GFX11-NEXT: v_or_b32_e32 v51, v83, v176 ; GFX11-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GFX11-NEXT: v_or_b32_e32 v32, v32, v52 -; GFX11-NEXT: v_or_b32_e32 v52, v53, v64 -; GFX11-NEXT: v_or_b32_e32 v33, v33, v65 -; GFX11-NEXT: v_or_b32_e32 v45, v36, v48 -; GFX11-NEXT: v_or_b32_e32 v46, v37, v49 -; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v40 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v183 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v161 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v160 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v67 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v70 +; GFX11-NEXT: v_and_b32_e32 
v33, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v167 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v41 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-NEXT: v_or_b32_e32 v69, v82, v69 +; GFX11-NEXT: v_or_b32_e32 v33, v33, v70 +; GFX11-NEXT: v_or_b32_e32 v34, v34, v83 +; GFX11-NEXT: v_or_b32_e32 v82, v37, v50 +; GFX11-NEXT: v_or_b32_e32 v83, v38, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v69 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v183 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v182 ; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v180 -; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v179 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v178 -; GFX11-NEXT: v_or_b32_e32 v37, v37, v48 -; GFX11-NEXT: v_or_b32_e32 v28, v28, v49 -; GFX11-NEXT: v_or_b32_e32 v48, v52, v53 -; GFX11-NEXT: v_or_b32_e32 v29, v29, v64 -; GFX11-NEXT: v_or_b32_e32 v49, v65, v67 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v162 +; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v161 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v180 +; GFX11-NEXT: v_and_b32_e32 v161, 0xff, v178 +; GFX11-NEXT: v_lshlrev_b32_e32 v162, 8, v177 +; GFX11-NEXT: v_or_b32_e32 v38, v38, v50 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v51 +; GFX11-NEXT: v_or_b32_e32 v50, v69, v68 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v70 +; GFX11-NEXT: v_or_b32_e32 v51, v161, v162 ; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v38 ; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GFX11-NEXT: v_or_b32_e32 v67, v32, v36 -; GFX11-NEXT: v_or_b32_e32 v68, 
v33, v37 -; GFX11-NEXT: v_or_b32_e32 v69, v28, v48 -; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-NEXT: v_or_b32_e32 v70, v29, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v149 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v148 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v66 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v167 -; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v166 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v144 -; GFX11-NEXT: v_or_b32_e32 v23, v23, v28 -; GFX11-NEXT: v_or_b32_e32 v28, v29, v32 -; GFX11-NEXT: v_or_b32_e32 v24, v24, v33 -; GFX11-NEXT: v_or_b32_e32 v29, v36, v37 -; GFX11-NEXT: v_or_b32_e32 v19, v19, v48 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v135 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off +; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v64, v33, v37 +; GFX11-NEXT: v_or_b32_e32 v65, v34, v38 +; GFX11-NEXT: v_or_b32_e32 v66, v29, v50 +; GFX11-NEXT: v_or_b32_e32 v67, v30, v51 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v149 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v148 ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v54 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v162 -; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v151 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v150 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v131 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v130 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v50 -; GFX11-NEXT: v_or_b32_e32 v32, v32, v33 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v36 -; GFX11-NEXT: v_or_b32_e32 v33, v37, v48 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v49 -; GFX11-NEXT: v_or_b32_e32 v36, v52, v50 -; 
GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v164 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v147 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v29 +; GFX11-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v34 +; GFX11-NEXT: v_or_b32_e32 v30, v37, v38 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v50 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v135 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v53 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v160 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v150 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v133 +; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v132 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-NEXT: v_or_b32_e32 v33, v33, v34 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v37 +; GFX11-NEXT: v_or_b32_e32 v34, v38, v50 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v51 +; GFX11-NEXT: v_or_b32_e32 v37, v53, v52 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_or_b32_e32 v148, v23, v28 -; GFX11-NEXT: v_or_b32_e32 v150, v19, v32 -; GFX11-NEXT: v_or_b32_e32 v151, v20, v33 -; GFX11-NEXT: v_or_b32_e32 v130, v15, v36 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v147 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v146 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v145 +; 
GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v50, v25, v29 +; GFX11-NEXT: v_or_b32_e32 v52, v21, v33 +; GFX11-NEXT: v_or_b32_e32 v53, v22, v34 +; GFX11-NEXT: v_or_b32_e32 v80, v17, v37 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v146 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v145 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v144 ; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v118 -; GFX11-NEXT: v_or_b32_e32 v149, v24, v29 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v117 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v129 +; GFX11-NEXT: v_or_b32_e32 v51, v26, v30 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v128 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v48 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v134 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v16, v19, v20 -; GFX11-NEXT: v_or_b32_e32 v13, v13, v23 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v133 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v132 -; GFX11-NEXT: v_or_b32_e32 v19, v24, v28 -; GFX11-NEXT: v_or_b32_e32 v14, v14, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 8, v134 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v25 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v130 +; GFX11-NEXT: v_or_b32_e32 v21, v26, v29 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v30 ; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v115 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v113 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 8, v35 ; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v129 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v128 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v119 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v23 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v119 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v117 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v25 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v24 -; GFX11-NEXT: v_or_b32_e32 v23, v28, v29 -; GFX11-NEXT: v_or_b32_e32 v12, v12, v32 -; GFX11-NEXT: v_or_b32_e32 v24, v33, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v26 +; GFX11-NEXT: v_or_b32_e32 v25, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-NEXT: v_or_b32_e32 v26, v34, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_or_b32_e32 v131, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v132, v13, v19 -; GFX11-NEXT: v_or_b32_e32 v133, v14, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_or_b32_e32 v81, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v82, v13, v21 +; GFX11-NEXT: v_or_b32_e32 v83, v14, v22 ; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v103 ; GFX11-NEXT: 
v_and_b32_e32 v14, 0xff, v102 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v31 ; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v116 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v114 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v112 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v23 -; GFX11-NEXT: v_or_b32_e32 v12, v12, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v25 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v26 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v98 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v100 ; GFX11-NEXT: v_or_b32_e32 v9, v9, v13 -; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX11-NEXT: v_or_b32_e32 v14, v19, v20 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v97 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v27 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v101 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v101 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v86 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v84 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v23 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v100 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v99 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v19 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v24 -; GFX11-NEXT: v_or_b32_e32 v19, v26, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v87 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v25 +; GFX11-NEXT: 
v_and_b32_e32 v22, 0xff, v99 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v98 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v21 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v26 +; GFX11-NEXT: v_or_b32_e32 v21, v27, v23 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_or_b32_e32 v16, v20, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v18, v22, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX11-NEXT: v_or_b32_e32 v13, v9, v13 ; GFX11-NEXT: v_or_b32_e32 v14, v10, v14 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v15 -; GFX11-NEXT: v_or_b32_e32 v9, v5, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v17 +; GFX11-NEXT: v_or_b32_e32 v9, v5, v21 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v96 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v87 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v85 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v21 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v85 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v49 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v18 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v71 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v55 ; GFX11-NEXT: v_or_b32_e32 
v5, v5, v6 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v15 -; GFX11-NEXT: v_or_b32_e32 v10, v19, v20 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v51 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v17 +; GFX11-NEXT: v_or_b32_e32 v10, v21, v19 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v36 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v71 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v27 -; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v35 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v18 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 -; GFX11-NEXT: v_or_b32_e32 v16, v18, v17 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 -; GFX11-NEXT: v_or_b32_e32 v17, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v15, v21, v15 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v22 +; GFX11-NEXT: v_or_b32_e32 v16, v20, v16 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v2 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX11-NEXT: v_or_b32_e32 v10, v5, v6 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[72:75], off -; GFX11-NEXT: scratch_store_b128 v0, v[43:46], off offset:16 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v17 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v18 -; GFX11-NEXT: v_or_b32_e32 v3, v19, v16 -; GFX11-NEXT: v_or_b32_e32 v4, v20, v17 +; GFX11-NEXT: v_or_b32_e32 v3, v19, v15 +; GFX11-NEXT: v_or_b32_e32 v4, v20, v16 ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: scratch_store_b128 v0, v[67:70], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[148:151], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[130:133], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:64 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x13 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 -; 
GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 -; GFX11-NEXT: v_readlane_b32 s104, v77, 8 -; GFX11-NEXT: v_readlane_b32 s103, v77, 7 -; GFX11-NEXT: v_readlane_b32 s102, v77, 6 -; GFX11-NEXT: v_readlane_b32 s101, v77, 5 -; GFX11-NEXT: v_readlane_b32 s100, v77, 4 -; GFX11-NEXT: v_readlane_b32 s99, v77, 3 -; GFX11-NEXT: v_readlane_b32 s98, v77, 2 -; GFX11-NEXT: v_readlane_b32 s97, v77, 1 -; GFX11-NEXT: v_readlane_b32 s96, v77, 0 -; GFX11-NEXT: v_readlane_b32 s87, v76, 31 -; GFX11-NEXT: v_readlane_b32 s86, v76, 30 -; GFX11-NEXT: v_readlane_b32 s85, v76, 29 -; GFX11-NEXT: v_readlane_b32 s84, v76, 28 -; GFX11-NEXT: v_readlane_b32 s83, v76, 27 -; GFX11-NEXT: v_readlane_b32 s82, v76, 26 -; GFX11-NEXT: v_readlane_b32 s81, v76, 25 -; GFX11-NEXT: v_readlane_b32 s80, v76, 24 -; GFX11-NEXT: v_readlane_b32 s71, v76, 23 -; GFX11-NEXT: v_readlane_b32 s70, v76, 22 -; GFX11-NEXT: v_readlane_b32 s69, v76, 21 -; GFX11-NEXT: v_readlane_b32 s68, v76, 20 -; GFX11-NEXT: v_readlane_b32 s67, v76, 19 -; GFX11-NEXT: v_readlane_b32 s66, v76, 18 -; GFX11-NEXT: v_readlane_b32 s65, v76, 17 -; GFX11-NEXT: v_readlane_b32 s64, v76, 16 -; GFX11-NEXT: v_readlane_b32 s55, v76, 15 -; GFX11-NEXT: v_readlane_b32 s54, v76, 14 -; GFX11-NEXT: v_readlane_b32 s53, v76, 13 -; GFX11-NEXT: v_readlane_b32 s52, v76, 12 -; GFX11-NEXT: v_readlane_b32 s51, v76, 11 -; GFX11-NEXT: v_readlane_b32 s50, v76, 10 -; GFX11-NEXT: v_readlane_b32 s49, v76, 9 -; GFX11-NEXT: v_readlane_b32 s48, v76, 8 -; GFX11-NEXT: v_readlane_b32 s39, v76, 7 -; GFX11-NEXT: v_readlane_b32 s38, v76, 6 -; GFX11-NEXT: v_readlane_b32 s37, v76, 5 -; GFX11-NEXT: v_readlane_b32 s36, v76, 4 -; GFX11-NEXT: v_readlane_b32 s35, v76, 3 -; GFX11-NEXT: v_readlane_b32 s34, v76, 2 -; GFX11-NEXT: v_readlane_b32 s31, v76, 1 -; GFX11-NEXT: 
v_readlane_b32 s30, v76, 0 +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:68 +; GFX11-NEXT: v_readlane_b32 s104, v75, 8 +; GFX11-NEXT: v_readlane_b32 s103, v75, 7 +; GFX11-NEXT: v_readlane_b32 s102, v75, 6 +; GFX11-NEXT: v_readlane_b32 s101, v75, 5 +; GFX11-NEXT: v_readlane_b32 s100, v75, 4 +; GFX11-NEXT: v_readlane_b32 s99, v75, 3 +; GFX11-NEXT: v_readlane_b32 s98, v75, 2 +; GFX11-NEXT: v_readlane_b32 s97, v75, 1 +; GFX11-NEXT: v_readlane_b32 s96, v75, 0 +; GFX11-NEXT: v_readlane_b32 s87, v74, 31 +; GFX11-NEXT: v_readlane_b32 s86, v74, 30 +; GFX11-NEXT: v_readlane_b32 s85, v74, 29 +; GFX11-NEXT: v_readlane_b32 s84, v74, 28 +; GFX11-NEXT: v_readlane_b32 s83, v74, 27 +; GFX11-NEXT: v_readlane_b32 s82, v74, 26 +; GFX11-NEXT: v_readlane_b32 s81, v74, 25 +; GFX11-NEXT: v_readlane_b32 s80, v74, 24 +; GFX11-NEXT: v_readlane_b32 s71, v74, 23 +; GFX11-NEXT: v_readlane_b32 s70, v74, 22 +; GFX11-NEXT: v_readlane_b32 s69, v74, 21 +; GFX11-NEXT: v_readlane_b32 s68, v74, 20 +; GFX11-NEXT: v_readlane_b32 s67, v74, 19 +; 
GFX11-NEXT: v_readlane_b32 s66, v74, 18 +; GFX11-NEXT: v_readlane_b32 s65, v74, 17 +; GFX11-NEXT: v_readlane_b32 s64, v74, 16 +; GFX11-NEXT: v_readlane_b32 s55, v74, 15 +; GFX11-NEXT: v_readlane_b32 s54, v74, 14 +; GFX11-NEXT: v_readlane_b32 s53, v74, 13 +; GFX11-NEXT: v_readlane_b32 s52, v74, 12 +; GFX11-NEXT: v_readlane_b32 s51, v74, 11 +; GFX11-NEXT: v_readlane_b32 s50, v74, 10 +; GFX11-NEXT: v_readlane_b32 s49, v74, 9 +; GFX11-NEXT: v_readlane_b32 s48, v74, 8 +; GFX11-NEXT: v_readlane_b32 s39, v74, 7 +; GFX11-NEXT: v_readlane_b32 s38, v74, 6 +; GFX11-NEXT: v_readlane_b32 s37, v74, 5 +; GFX11-NEXT: v_readlane_b32 s36, v74, 4 +; GFX11-NEXT: v_readlane_b32 s35, v74, 3 +; GFX11-NEXT: v_readlane_b32 s34, v74, 2 +; GFX11-NEXT: v_readlane_b32 s31, v74, 1 +; GFX11-NEXT: v_readlane_b32 s30, v74, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:92 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -126635,10 +127119,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:208 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v130, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:240 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v95, off, s32 offset:248 @@ -126671,7 +127155,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:132 ; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:124 @@ -126732,25 +127216,25 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v115 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 8, v116 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v119, 8, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v132 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) @@ -126791,253 +127275,262 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v0, v0, v76 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, 
v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 
0xff, v102 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v135 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v145 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v129 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xff, v71 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff @@ -127065,18 +127558,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 ; GFX11-TRUE16-NEXT: s_or_b32 s8, 
s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB75_3 @@ -127255,7 +127738,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v119 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v58, v1 @@ -127329,9 +127812,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v145, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v131, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v132, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 
@@ -127365,13 +127848,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v133, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v135, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v144, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v128, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v129, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v130, v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -127576,10 +128059,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:232 ; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 ; GFX11-FAKE16-NEXT: 
s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 @@ -127612,7 +128095,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 ; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:140 ; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 ; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 @@ -127673,25 +128156,25 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v99 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v132 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) @@ -127732,253 +128215,262 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v1, 0xff, v47 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v177 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v119 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v134 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v135 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 -; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v144 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v129 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 
v1, v116 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff @@ -128006,18 +128498,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 ; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 ; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB75_3 @@ -128196,7 +128678,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v119 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 @@ -128270,9 +128752,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v145, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v131, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v132, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -128306,13 +128788,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v133, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v135, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v144, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v128, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v129, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v130, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -129949,26 +130431,26 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 -; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 
v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -130013,26 +130495,26 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_mov_b32_e32 v31, v17 -; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -130871,9 +131353,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; 
VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -130889,9 +131371,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -130907,9 +131389,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -130925,9 +131407,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -130943,9 +131425,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 
v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -130961,9 +131443,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -130979,9 +131461,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -130997,9 +131479,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 
0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131015,9 +131497,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131033,9 +131515,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131051,9 +131533,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131069,9 +131551,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; 
VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131087,9 +131569,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131105,9 +131587,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131123,9 +131605,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 
v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131141,10 +131623,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131160,9 +131642,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131178,9 +131660,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131196,9 +131678,9 @@ define <16 x 
double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131214,9 +131696,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131232,9 +131714,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131250,9 +131732,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; VI-NEXT: 
v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131268,9 +131750,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131286,9 +131768,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131304,9 +131786,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; 
VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131322,9 +131804,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131340,9 +131822,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131358,9 +131840,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131376,9 +131858,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: 
v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131394,9 +131876,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131412,9 +131894,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 @@ -131464,9 +131946,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, 
v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131479,9 +131961,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131494,9 +131976,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131509,9 +131991,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131524,9 +132006,9 @@ define <16 x double> 
@bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131539,9 +132021,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131554,9 +132036,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131569,9 +132051,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131584,9 +132066,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131599,9 +132081,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131614,9 +132096,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: 
v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131629,9 +132111,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131644,9 +132126,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131659,9 +132141,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131674,9 +132156,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: 
v_or_b32_e32 v34, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131689,10 +132171,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131705,9 +132187,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131720,9 +132202,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131735,9 +132217,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131750,9 +132232,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131765,9 +132247,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 
0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131780,9 +132262,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131795,9 +132277,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131810,9 +132292,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131825,9 +132307,9 @@ define <16 x double> 
@bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131840,9 +132322,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131855,9 +132337,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131870,9 +132352,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; 
GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131885,9 +132367,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131900,9 +132382,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -131915,9 +132397,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: 
v_add_f32_e32 v32, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 @@ -133684,8 +134166,8 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -133718,9 +134200,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133736,9 +134218,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133754,9 +134236,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 
+; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133772,9 +134254,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133790,9 +134272,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133808,9 +134290,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; 
VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133826,9 +134308,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133844,9 +134326,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133862,9 +134344,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133880,9 +134362,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133898,9 +134380,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133916,9 +134398,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133934,9 +134416,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: 
v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133952,9 +134434,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133970,9 +134452,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -133988,9 +134470,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 ; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134006,9 +134488,9 @@ define inreg <16 x 
double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134024,9 +134506,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134042,9 +134524,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134060,9 +134542,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v33, 
16, v27 ; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134078,9 +134560,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134096,9 +134578,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 ; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134114,9 +134596,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; 
VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134132,9 +134614,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 ; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134150,9 +134632,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 ; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134168,9 +134650,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134186,9 +134668,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134204,9 +134686,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134222,9 +134704,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 ; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134240,9 +134722,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134258,9 +134740,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -134307,8 +134789,8 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -134944,864 +135426,1020 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 
offset:256 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: 
scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: 
v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 
+; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB79_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; 
GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB79_3 ; GFX11-NEXT: .LBB79_2: ; %cmp.true -; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s4, s27, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s5, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 -; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 -; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s4, s26, 16 +; GFX11-NEXT: s_and_b32 s5, s24, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_lshl_b32 s4, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v30, v0, 16, v1 +; GFX11-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s23, 
0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v3 :: v_dual_add_nc_u32 v3, v6, v1 +; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; 
GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v3, v11 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v13, v12 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v9, v8 ; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v15, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v12, v10 ; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v15 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v10, v16, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, 
v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 ; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v16 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v9, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v12 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_bfe_u32 v12, v15, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v124, v3, 16, v5 +; GFX11-NEXT: v_lshl_or_b32 v112, v6, 16, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v15 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v12, v11, v12, vcc_lo +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshl_or_b32 v101, v8, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v137, v2, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: 
v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX11-NEXT: v_lshl_or_b32 v91, v9, 16, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc_lo +; GFX11-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v82, v11, 16, v13 +; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v16 ; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v17 ; GFX11-NEXT: 
v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-NEXT: v_lshl_or_b32 v74, v15, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 ; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v19 +; GFX11-NEXT: v_lshl_or_b32 v67, v17, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v21, v21, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v22, v22, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-NEXT: v_lshl_or_b32 v61, v19, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 
0x7fff, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s3 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v23 +; GFX11-NEXT: v_lshl_or_b32 v56, v21, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s2 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; 
GFX11-NEXT: v_add_nc_u32_e32 v25, v25, v24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x7fff, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v26, v26, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s1 ; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; 
GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_lshl_or_b32 v52, v23, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v27, v27, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v28, v28, v27 +; GFX11-NEXT: v_lshl_or_b32 v49, v25, 16, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v29, v29, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v29 +; GFX11-NEXT: v_dual_cndmask_b32 
v28, v29, v31 :: v_dual_lshlrev_b32 v29, 16, v176 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v176 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v47, v27, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v176, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v177 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v177 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v177, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v178 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v179 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v179 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: 
v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v179, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v180 +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: 
v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v180 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v181 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v181 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-NEXT: v_lshl_or_b32 v181, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v182 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v182 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, 
v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v183 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v183 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: 
v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-NEXT: v_lshl_or_b32 v183, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v170 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 
v49, 0x400000, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v170 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 
16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 -; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, 
v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_lshl_or_b32 v170, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v171 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v171 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v171, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v172 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v172 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, 
v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v172, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v173 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v173 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v173, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v174 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v174 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v174, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v175 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v175 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v175, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v185 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v185 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v185, v31, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v184 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v184 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v31 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_lshl_or_b32 v184, v31, 16, v29 ; GFX11-NEXT: .LBB79_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: 
scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: 
scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 
v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: 
v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: 
scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB79_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: 
$vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 ; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: 
$vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 ; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 ; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: 
$vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: 
$vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 ; GFX11-NEXT: s_branch .LBB79_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -137329,26 +137967,26 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 -; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: 
v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -137393,26 +138031,26 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_mov_b32_e32 v31, v17 -; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 
v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -139208,8 +139846,8 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -139352,8 +139990,8 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -139413,107 +140051,109 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 
offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:296 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:244 +; GFX11-NEXT: 
scratch_store_b32 off, v62, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:172 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 
offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 
offset:140 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:44 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, 
v185, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 ; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 ; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 ; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v171, v5 :: v_dual_mov_b32 v172, v4 +; GFX11-NEXT: v_dual_mov_b32 v173, v3 :: v_dual_mov_b32 v174, v2 +; GFX11-NEXT: v_dual_mov_b32 v175, v1 :: v_dual_mov_b32 v184, v0 +; GFX11-NEXT: v_dual_mov_b32 v185, s28 :: v_dual_mov_b32 v186, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v61, s0 :: v_dual_mov_b32 v66, s2 +; GFX11-NEXT: v_dual_mov_b32 v63, s1 :: v_dual_mov_b32 v70, s3 +; GFX11-NEXT: v_dual_mov_b32 v75, s16 :: v_dual_mov_b32 v88, s18 +; GFX11-NEXT: v_dual_mov_b32 v81, s17 :: v_dual_mov_b32 v96, s19 +; GFX11-NEXT: v_dual_mov_b32 v105, s20 :: v_dual_mov_b32 v126, s22 +; GFX11-NEXT: v_dual_mov_b32 v115, s21 :: v_dual_mov_b32 v138, s23 +; GFX11-NEXT: v_dual_mov_b32 v151, s24 :: v_dual_mov_b32 v28, 
s26 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v44, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 ; GFX11-NEXT: .LBB83_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v44, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] @@ -139522,142 +140162,142 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 
op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v138, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v126, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v115, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v105, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v96, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v88, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v81, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v75, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v70, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v66, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v63, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB83_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v0, v61 :: 
v_dual_mov_b32 v1, v63 +; GFX11-NEXT: v_dual_mov_b32 v3, v70 :: v_dual_mov_b32 v4, v75 +; GFX11-NEXT: v_dual_mov_b32 v6, v88 :: v_dual_mov_b32 v9, v115 +; GFX11-NEXT: v_dual_mov_b32 v7, v96 :: v_dual_mov_b32 v8, v105 +; GFX11-NEXT: v_dual_mov_b32 v10, v126 :: v_dual_mov_b32 v15, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v138 :: v_dual_mov_b32 v12, v151 +; GFX11-NEXT: v_dual_mov_b32 v14, v28 :: v_dual_mov_b32 v17, v186 +; GFX11-NEXT: v_dual_mov_b32 v16, v185 :: v_dual_mov_b32 v19, v175 +; GFX11-NEXT: v_dual_mov_b32 v18, v184 :: v_dual_mov_b32 v21, v173 +; GFX11-NEXT: v_dual_mov_b32 v20, v174 :: v_dual_mov_b32 v23, v171 +; GFX11-NEXT: v_dual_mov_b32 v22, v172 :: v_dual_mov_b32 v25, v182 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, 
off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v186, off, s32 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 
offset:100 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; 
GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 
offset:180 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:252 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:292 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:296 +; GFX11-NEXT: v_dual_mov_b32 v2, v66 :: v_dual_mov_b32 v5, v81 +; GFX11-NEXT: v_dual_mov_b32 v24, v183 :: v_dual_mov_b32 v27, v180 +; GFX11-NEXT: v_dual_mov_b32 v26, v181 :: v_dual_mov_b32 v29, v178 +; GFX11-NEXT: v_dual_mov_b32 v28, v179 :: v_dual_mov_b32 v31, v176 +; GFX11-NEXT: v_mov_b32_e32 v30, v177 
; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB83_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: 
$vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92 ; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: 
$vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60 +; GFX11-NEXT: ; implicit-def: 
$vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137 +; GFX11-NEXT: ; implicit-def: 
$vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158 +; GFX11-NEXT: ; implicit-def: $vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170 ; GFX11-NEXT: s_branch .LBB83_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -140177,10 +140817,10 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_mov_b32_e32 v31, s16 -; SI-NEXT: v_mov_b32_e32 v32, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v29, s16 +; SI-NEXT: v_mov_b32_e32 v30, s17 +; SI-NEXT: v_mov_b32_e32 v31, s18 +; SI-NEXT: v_mov_b32_e32 v32, s19 ; SI-NEXT: v_mov_b32_e32 v27, s20 ; SI-NEXT: v_mov_b32_e32 v28, s21 ; SI-NEXT: v_mov_b32_e32 v25, s22 @@ -140226,9 +140866,9 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v45, v26, v25, 16 ; SI-NEXT: v_alignbit_b32 v47, v28, v27, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v58, v32, v31, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v60, v30, v29, 16 ; SI-NEXT: v_lshrrev_b32_e32 
v48, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 @@ -140246,14 +140886,14 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; SI-NEXT: s_cbranch_execnz .LBB85_3 ; SI-NEXT: .LBB85_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 @@ -140283,9 +140923,9 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v45, v26, v25, 16 ; SI-NEXT: v_alignbit_b32 v47, v28, v27, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v58, v32, v31, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v60, v30, v29, 16 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 @@ -140303,29 +140943,29 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; SI-NEXT: .LBB85_3: ; %end ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: 
v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v60 -; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v58 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v29, v29, v60 +; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v58 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 @@ -140559,26 +141199,26 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 -; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: 
v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -140623,26 +141263,26 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_mov_b32_e32 v31, v17 -; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: 
v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -142261,8 +142901,8 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -142321,107 +142961,109 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:296 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:212 +; GFX11-NEXT: 
scratch_store_b32 off, v78, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:172 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 
offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v127, 
s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:44 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 ; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 ; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 ; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; 
GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v171, v5 :: v_dual_mov_b32 v172, v4 +; GFX11-NEXT: v_dual_mov_b32 v173, v3 :: v_dual_mov_b32 v174, v2 +; GFX11-NEXT: v_dual_mov_b32 v175, v1 :: v_dual_mov_b32 v184, v0 +; GFX11-NEXT: v_dual_mov_b32 v185, s28 :: v_dual_mov_b32 v186, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v61, s0 :: v_dual_mov_b32 v66, s2 +; GFX11-NEXT: v_dual_mov_b32 v63, s1 :: v_dual_mov_b32 v70, s3 +; GFX11-NEXT: v_dual_mov_b32 v75, s16 :: v_dual_mov_b32 v88, s18 +; GFX11-NEXT: v_dual_mov_b32 v81, s17 :: v_dual_mov_b32 v96, s19 +; GFX11-NEXT: v_dual_mov_b32 v105, s20 :: v_dual_mov_b32 v126, s22 +; GFX11-NEXT: v_dual_mov_b32 v115, s21 :: v_dual_mov_b32 v138, s23 +; GFX11-NEXT: v_dual_mov_b32 v151, s24 :: v_dual_mov_b32 v28, s26 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v44, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 ; GFX11-NEXT: .LBB87_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v44, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 
op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] @@ -142430,142 +143072,142 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v138, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v126, s22, 3 op_sel_hi:[1,0] 
+; GFX11-NEXT: v_pk_add_u16 v115, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v105, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v96, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v88, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v81, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v75, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v70, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v66, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v63, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB87_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v0, v61 :: v_dual_mov_b32 v1, v63 +; GFX11-NEXT: v_dual_mov_b32 v3, v70 :: v_dual_mov_b32 v4, v75 +; GFX11-NEXT: v_dual_mov_b32 v6, v88 :: v_dual_mov_b32 v9, v115 +; GFX11-NEXT: v_dual_mov_b32 v7, v96 :: v_dual_mov_b32 v8, v105 +; GFX11-NEXT: v_dual_mov_b32 v10, v126 :: v_dual_mov_b32 v15, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v138 :: v_dual_mov_b32 v12, v151 +; GFX11-NEXT: v_dual_mov_b32 v14, v28 :: v_dual_mov_b32 v17, v186 +; GFX11-NEXT: v_dual_mov_b32 v16, v185 :: v_dual_mov_b32 v19, v175 +; GFX11-NEXT: v_dual_mov_b32 v18, v184 :: v_dual_mov_b32 v21, v173 +; GFX11-NEXT: v_dual_mov_b32 v20, v174 :: v_dual_mov_b32 v23, v171 +; GFX11-NEXT: v_dual_mov_b32 v22, 
v172 :: v_dual_mov_b32 v25, v182 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v186, off, s32 +; GFX11-NEXT: scratch_load_b32 v185, off, s32 offset:4 +; GFX11-NEXT: 
scratch_load_b32 v184, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:124 ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, 
off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; 
GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:224 +; GFX11-NEXT: 
scratch_load_b32 v73, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:252 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:292 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:296 +; GFX11-NEXT: v_dual_mov_b32 v2, v66 :: v_dual_mov_b32 v5, v81 +; GFX11-NEXT: v_dual_mov_b32 v24, v183 :: v_dual_mov_b32 v27, v180 +; GFX11-NEXT: v_dual_mov_b32 v26, v181 :: v_dual_mov_b32 v29, v178 +; GFX11-NEXT: v_dual_mov_b32 v28, v179 :: v_dual_mov_b32 v31, v176 +; GFX11-NEXT: v_mov_b32_e32 v30, v177 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB87_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92 ; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: 
$vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: 
$vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-NEXT: ; implicit-def: 
$vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137 +; GFX11-NEXT: ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158 +; GFX11-NEXT: ; implicit-def: 
$vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170 ; GFX11-NEXT: s_branch .LBB87_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -148907,11 +149549,19 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -148920,794 +149570,733 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:276 +; SI-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v43, s19, 0 -; SI-NEXT: v_writelane_b32 v43, s18, 1 -; SI-NEXT: v_writelane_b32 v43, s17, 2 -; SI-NEXT: v_writelane_b32 v43, s16, 3 -; SI-NEXT: s_mov_b32 s60, s24 -; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: v_writelane_b32 v41, s31, 1 -; SI-NEXT: v_writelane_b32 v41, s34, 2 -; SI-NEXT: v_writelane_b32 v41, s35, 3 -; SI-NEXT: v_writelane_b32 v41, s36, 4 -; SI-NEXT: v_writelane_b32 v41, s37, 5 -; SI-NEXT: v_writelane_b32 v41, s38, 6 -; SI-NEXT: v_writelane_b32 v41, s39, 7 -; SI-NEXT: v_writelane_b32 v41, s48, 8 -; SI-NEXT: v_writelane_b32 v41, s49, 9 -; SI-NEXT: 
v_writelane_b32 v41, s50, 10 -; SI-NEXT: v_writelane_b32 v41, s51, 11 -; SI-NEXT: v_writelane_b32 v41, s52, 12 -; SI-NEXT: v_writelane_b32 v41, s53, 13 -; SI-NEXT: v_writelane_b32 v41, s54, 14 -; SI-NEXT: v_writelane_b32 v41, s55, 15 -; SI-NEXT: v_writelane_b32 v41, s64, 16 -; SI-NEXT: v_writelane_b32 v41, s65, 17 -; SI-NEXT: v_writelane_b32 v41, s66, 18 -; SI-NEXT: v_writelane_b32 v41, s67, 19 -; SI-NEXT: v_writelane_b32 v41, s68, 20 -; SI-NEXT: v_writelane_b32 v41, s69, 21 -; SI-NEXT: v_writelane_b32 v41, s70, 22 -; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s77, s28 -; SI-NEXT: s_mov_b32 s76, s27 -; SI-NEXT: v_writelane_b32 v41, s80, 24 -; SI-NEXT: v_writelane_b32 v41, s81, 25 -; SI-NEXT: v_writelane_b32 v41, s82, 26 -; SI-NEXT: v_writelane_b32 v41, s83, 27 -; SI-NEXT: v_writelane_b32 v41, s84, 28 -; SI-NEXT: v_writelane_b32 v41, s85, 29 -; SI-NEXT: v_writelane_b32 v41, s86, 30 -; SI-NEXT: v_writelane_b32 v41, s87, 31 -; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: v_writelane_b32 v41, s97, 33 -; SI-NEXT: v_writelane_b32 v41, s98, 34 -; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_mov_b32 s79, s26 -; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v42, s38, 0 -; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v42, s39, 1 -; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v42, s48, 2 -; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v42, s49, 3 -; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v42, s50, 4 -; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v42, s51, 5 -; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v42, s52, 6 -; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v42, s53, 7 -; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v42, s54, 8 -; SI-NEXT: 
v_writelane_b32 v42, s55, 9 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s6, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 4 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s4, v33 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v43, s4, 5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v43, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s91, v10 -; SI-NEXT: v_readfirstlane_b32 s92, v8 -; 
SI-NEXT: v_readfirstlane_b32 s93, v7 -; SI-NEXT: v_readfirstlane_b32 s94, v13 -; SI-NEXT: v_readfirstlane_b32 s95, v14 -; SI-NEXT: v_readfirstlane_b32 s30, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v18 -; SI-NEXT: v_readfirstlane_b32 s34, v16 -; SI-NEXT: v_readfirstlane_b32 s35, v15 -; SI-NEXT: v_readfirstlane_b32 s36, v21 -; SI-NEXT: v_readfirstlane_b32 s37, v22 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 13 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 +; SI-NEXT: v_writelane_b32 v57, s30, 0 +; SI-NEXT: v_writelane_b32 v57, s31, 1 +; SI-NEXT: v_writelane_b32 v57, s34, 2 +; SI-NEXT: v_writelane_b32 v57, s35, 3 +; SI-NEXT: v_writelane_b32 v57, s36, 4 +; SI-NEXT: v_writelane_b32 v57, s37, 5 +; SI-NEXT: v_writelane_b32 v57, s38, 6 +; SI-NEXT: v_writelane_b32 v57, s39, 7 +; SI-NEXT: v_writelane_b32 v57, s48, 8 +; SI-NEXT: v_writelane_b32 v57, s49, 9 +; SI-NEXT: v_writelane_b32 v57, s50, 10 +; SI-NEXT: v_writelane_b32 v57, s51, 11 +; SI-NEXT: v_writelane_b32 v57, s52, 12 +; SI-NEXT: v_writelane_b32 v57, s53, 13 +; SI-NEXT: v_writelane_b32 v57, s54, 14 +; SI-NEXT: v_writelane_b32 v57, s55, 15 +; SI-NEXT: v_writelane_b32 v57, s64, 16 +; SI-NEXT: v_writelane_b32 v57, s65, 17 +; SI-NEXT: v_writelane_b32 v57, s66, 18 +; SI-NEXT: v_writelane_b32 v57, s67, 19 +; SI-NEXT: v_writelane_b32 v57, s68, 20 +; SI-NEXT: v_writelane_b32 v57, s69, 21 +; SI-NEXT: v_writelane_b32 v57, s70, 22 +; SI-NEXT: v_writelane_b32 v57, s71, 23 +; SI-NEXT: v_writelane_b32 v57, s80, 24 +; SI-NEXT: v_writelane_b32 v57, s81, 25 +; SI-NEXT: v_writelane_b32 v57, s82, 26 +; SI-NEXT: 
v_writelane_b32 v57, s83, 27 +; SI-NEXT: v_writelane_b32 v57, s84, 28 +; SI-NEXT: v_writelane_b32 v57, s85, 29 +; SI-NEXT: v_writelane_b32 v57, s86, 30 +; SI-NEXT: v_writelane_b32 v57, s87, 31 +; SI-NEXT: v_writelane_b32 v57, s96, 32 +; SI-NEXT: ; implicit-def: $vgpr58 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v57, s97, 33 +; SI-NEXT: v_writelane_b32 v57, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s97, v28 +; SI-NEXT: v_readfirstlane_b32 s98, v27 +; SI-NEXT: v_readfirstlane_b32 s79, v24 +; SI-NEXT: v_readfirstlane_b32 s81, v23 +; SI-NEXT: v_readfirstlane_b32 s39, v8 +; SI-NEXT: v_readfirstlane_b32 s50, v7 +; SI-NEXT: v_readfirstlane_b32 s34, v6 +; SI-NEXT: v_readfirstlane_b32 s35, v5 +; SI-NEXT: v_readfirstlane_b32 s37, v4 +; SI-NEXT: v_readfirstlane_b32 s38, v3 +; SI-NEXT: v_readfirstlane_b32 s31, v1 +; SI-NEXT: v_writelane_b32 v57, s99, 35 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 +; SI-NEXT: v_readfirstlane_b32 s8, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 +; SI-NEXT: v_readfirstlane_b32 s43, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:264 +; SI-NEXT: v_readfirstlane_b32 s44, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 +; SI-NEXT: v_readfirstlane_b32 s11, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:256 +; SI-NEXT: v_readfirstlane_b32 s13, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:252 +; SI-NEXT: v_readfirstlane_b32 s41, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:248 
+; SI-NEXT: v_readfirstlane_b32 s47, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244 +; SI-NEXT: v_readfirstlane_b32 s10, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240 +; SI-NEXT: v_readfirstlane_b32 s12, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:236 ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 14 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 +; SI-NEXT: v_writelane_b32 v59, s4, 0 ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 15 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 +; SI-NEXT: v_writelane_b32 v59, s4, 1 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s75, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s61, v33 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 +; SI-NEXT: v_writelane_b32 v59, s4, 2 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:220 +; SI-NEXT: v_writelane_b32 v59, s4, 3 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: v_writelane_b32 v59, s4, 4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v59, s4, 5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s14, v32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_writelane_b32 v59, s4, 6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v59, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v59, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s40, v35 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s63, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s59, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: v_writelane_b32 v59, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v59, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v59, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v59, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s13, v50 +; SI-NEXT: v_readfirstlane_b32 s45, v48 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s45, v51 -; SI-NEXT: buffer_load_dword 
v31, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s47, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s78, v34 -; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v35, 
off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:128 +; SI-NEXT: v_writelane_b32 v59, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: v_writelane_b32 v59, s4, 14 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v43, s4, 27 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 +; SI-NEXT: v_writelane_b32 v59, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 
v43, s4, 29 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:116 +; SI-NEXT: v_writelane_b32 v59, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:336 +; SI-NEXT: v_writelane_b32 v59, s4, 17 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_writelane_b32 v59, s4, 18 ; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_writelane_b32 v59, s4, 19 ; SI-NEXT: v_readfirstlane_b32 s4, v40 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_writelane_b32 v43, s22, 34 -; SI-NEXT: v_writelane_b32 v43, s23, 35 -; SI-NEXT: v_writelane_b32 v43, s72, 36 -; SI-NEXT: v_writelane_b32 v43, s20, 37 -; SI-NEXT: v_writelane_b32 v43, s79, 38 -; SI-NEXT: v_writelane_b32 v43, s76, 39 -; SI-NEXT: v_writelane_b32 v43, s25, 40 -; SI-NEXT: v_writelane_b32 v43, s60, 41 -; SI-NEXT: v_writelane_b32 v43, s29, 42 -; SI-NEXT: v_writelane_b32 v43, s77, 43 -; SI-NEXT: v_writelane_b32 v43, s16, 44 -; SI-NEXT: v_writelane_b32 v43, s17, 45 -; SI-NEXT: v_writelane_b32 v43, s18, 46 -; SI-NEXT: v_writelane_b32 v43, s19, 47 -; SI-NEXT: v_writelane_b32 v43, s88, 48 -; SI-NEXT: v_writelane_b32 v43, s89, 49 -; SI-NEXT: v_writelane_b32 v43, s90, 50 -; SI-NEXT: v_writelane_b32 v43, s91, 51 -; SI-NEXT: v_writelane_b32 v43, s92, 52 -; SI-NEXT: v_writelane_b32 v43, s93, 53 -; SI-NEXT: v_writelane_b32 v43, s94, 54 -; SI-NEXT: v_writelane_b32 v43, s95, 55 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s62, v33 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s10, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s66, v35 -; SI-NEXT: v_readfirstlane_b32 s28, v31 -; SI-NEXT: v_readfirstlane_b32 s27, v32 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s58, v36 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: 
v_readfirstlane_b32 s69, v37 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s14, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s68, v39 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s11, v49 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s70, v50 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s71, v51 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: v_writelane_b32 v59, s4, 20 +; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: v_writelane_b32 v59, s4, 21 +; SI-NEXT: v_readfirstlane_b32 s4, v42 +; SI-NEXT: v_writelane_b32 v59, s4, 22 +; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: v_writelane_b32 v59, s4, 23 +; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_writelane_b32 v59, s4, 24 +; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: v_writelane_b32 v59, s4, 25 +; SI-NEXT: v_readfirstlane_b32 s4, v46 +; SI-NEXT: v_writelane_b32 v59, s4, 26 +; SI-NEXT: v_readfirstlane_b32 s4, v47 +; SI-NEXT: v_writelane_b32 v59, s4, 27 +; SI-NEXT: v_readfirstlane_b32 s58, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s36, v31 +; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s52, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s94, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s49, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s51, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s96, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s91, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s30, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s48, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s53, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s80, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: v_readfirstlane_b32 s95, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s88, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s83, v52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 -; 
SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57 -; SI-NEXT: v_writelane_b32 v43, s30, 58 -; SI-NEXT: v_writelane_b32 v43, s31, 59 -; SI-NEXT: v_writelane_b32 v43, s34, 60 -; SI-NEXT: v_writelane_b32 v43, s35, 61 -; SI-NEXT: v_writelane_b32 v43, s36, 62 -; SI-NEXT: v_writelane_b32 v43, s37, 63 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s46, v32 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s96, v33 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s98, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s41, v35 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s56, v36 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s87, v37 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s99, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s81, v39 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s26, v48 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s82, v50 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s7, v51 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; 
SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s93, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s70, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s90, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s63, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s72, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s66, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s69, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s55, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s61, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s71, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s65, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s92, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s68, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s42, v52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: 
s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s67, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s62, v31 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s64, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s97, v32 +; SI-NEXT: v_readfirstlane_b32 s86, v34 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s44, v33 +; SI-NEXT: v_readfirstlane_b32 s54, v36 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s9, v34 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v37 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s80, v35 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v38 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s86, v36 +; SI-NEXT: v_readfirstlane_b32 s84, v39 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s85, v37 +; SI-NEXT: v_readfirstlane_b32 s85, v48 +; SI-NEXT: v_writelane_b32 v58, s85, 0 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s8, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v59, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s12, v39 +; SI-NEXT: v_readfirstlane_b32 s87, v33 +; SI-NEXT: v_writelane_b32 v58, s86, 1 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s65, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v59, s4, 29 +; SI-NEXT: v_readfirstlane_b32 s4, v30 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v42, s64, 10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v42, s65, 11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v42, s67, 12 -; SI-NEXT: v_writelane_b32 v42, s84, 13 -; SI-NEXT: v_writelane_b32 v42, s85, 14 -; SI-NEXT: v_writelane_b32 v42, s86, 15 -; SI-NEXT: v_writelane_b32 v42, s87, 16 -; SI-NEXT: v_writelane_b32 v42, s8, 
17 -; SI-NEXT: v_writelane_b32 v42, s99, 18 -; SI-NEXT: v_writelane_b32 v42, s12, 19 -; SI-NEXT: v_writelane_b32 v42, s44, 20 -; SI-NEXT: v_writelane_b32 v42, s97, 21 -; SI-NEXT: v_writelane_b32 v42, s83, 22 -; SI-NEXT: v_writelane_b32 v42, s82, 23 -; SI-NEXT: v_writelane_b32 v42, s98, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 25 -; SI-NEXT: v_writelane_b32 v42, s81, 26 -; SI-NEXT: v_writelane_b32 v42, s9, 27 -; SI-NEXT: v_writelane_b32 v42, s41, 28 -; SI-NEXT: v_writelane_b32 v42, s80, 29 -; SI-NEXT: v_writelane_b32 v42, s7, 30 -; SI-NEXT: v_writelane_b32 v42, s56, 31 -; SI-NEXT: v_writelane_b32 v42, s26, 32 -; SI-NEXT: v_writelane_b32 v42, s15, 33 -; SI-NEXT: v_writelane_b32 v42, s14, 34 -; SI-NEXT: v_writelane_b32 v42, s69, 35 -; SI-NEXT: v_writelane_b32 v42, s71, 36 -; SI-NEXT: v_writelane_b32 v42, s70, 37 -; SI-NEXT: v_writelane_b32 v42, s68, 38 -; SI-NEXT: v_writelane_b32 v42, s74, 39 -; SI-NEXT: v_writelane_b32 v42, s46, 40 -; SI-NEXT: v_writelane_b32 v42, s11, 41 -; SI-NEXT: v_writelane_b32 v42, s10, 42 -; SI-NEXT: v_writelane_b32 v42, s62, 43 -; SI-NEXT: v_writelane_b32 v42, s66, 44 -; SI-NEXT: v_writelane_b32 v42, s58, 45 -; SI-NEXT: v_writelane_b32 v42, s28, 46 -; SI-NEXT: v_writelane_b32 v42, s27, 47 -; SI-NEXT: v_writelane_b32 v42, s78, 48 -; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: v_readfirstlane_b32 s15, v35 +; SI-NEXT: v_writelane_b32 v59, s4, 30 +; SI-NEXT: v_readfirstlane_b32 s4, v29 +; SI-NEXT: v_writelane_b32 v58, s87, 2 +; SI-NEXT: v_writelane_b32 v59, s4, 31 +; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: v_writelane_b32 v58, s15, 3 +; SI-NEXT: v_writelane_b32 v59, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: v_writelane_b32 v58, s42, 4 +; SI-NEXT: v_writelane_b32 v59, s4, 33 +; SI-NEXT: v_readfirstlane_b32 s4, v22 +; SI-NEXT: v_writelane_b32 v58, s67, 5 +; SI-NEXT: v_writelane_b32 v59, s4, 34 +; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: v_writelane_b32 v58, s54, 6 +; SI-NEXT: v_writelane_b32 v59, s4, 35 +; 
SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_writelane_b32 v58, s55, 7 +; SI-NEXT: v_writelane_b32 v59, s4, 36 +; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: v_writelane_b32 v58, s61, 8 +; SI-NEXT: v_writelane_b32 v59, s4, 37 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_writelane_b32 v58, s63, 9 +; SI-NEXT: v_writelane_b32 v59, s4, 38 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_writelane_b32 v58, s72, 10 +; SI-NEXT: v_writelane_b32 v59, s4, 39 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_writelane_b32 v58, s62, 11 +; SI-NEXT: v_writelane_b32 v59, s4, 40 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_writelane_b32 v58, s64, 12 +; SI-NEXT: v_writelane_b32 v59, s4, 41 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: v_writelane_b32 v58, s65, 13 +; SI-NEXT: v_writelane_b32 v59, s4, 42 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_writelane_b32 v58, s66, 14 +; SI-NEXT: v_writelane_b32 v59, s4, 43 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_writelane_b32 v58, s68, 15 +; SI-NEXT: v_writelane_b32 v59, s4, 44 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: v_writelane_b32 v58, s69, 16 +; SI-NEXT: v_writelane_b32 v59, s4, 45 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_writelane_b32 v58, s70, 17 +; SI-NEXT: v_writelane_b32 v59, s4, 46 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_writelane_b32 v58, s71, 18 +; SI-NEXT: v_writelane_b32 v59, s4, 47 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_writelane_b32 v58, s80, 19 +; SI-NEXT: v_writelane_b32 v59, s4, 48 +; SI-NEXT: v_writelane_b32 v58, s88, 20 +; SI-NEXT: v_writelane_b32 v59, s18, 49 +; SI-NEXT: v_writelane_b32 v58, s96, 21 +; SI-NEXT: v_writelane_b32 v59, s19, 50 +; SI-NEXT: v_writelane_b32 v58, s90, 22 +; SI-NEXT: v_writelane_b32 v59, s17, 51 +; SI-NEXT: v_writelane_b32 v58, s91, 23 +; SI-NEXT: v_writelane_b32 v59, s16, 52 +; SI-NEXT: v_writelane_b32 v58, s83, 24 +; SI-NEXT: v_writelane_b32 v59, s22, 53 +; SI-NEXT: 
v_writelane_b32 v58, s92, 25 +; SI-NEXT: v_writelane_b32 v59, s23, 54 +; SI-NEXT: v_writelane_b32 v58, s93, 26 +; SI-NEXT: v_writelane_b32 v59, s21, 55 +; SI-NEXT: v_writelane_b32 v58, s94, 27 +; SI-NEXT: v_writelane_b32 v59, s20, 56 +; SI-NEXT: v_writelane_b32 v58, s95, 28 +; SI-NEXT: v_writelane_b32 v59, s28, 57 +; SI-NEXT: v_writelane_b32 v58, s30, 29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s9, v51 +; SI-NEXT: v_writelane_b32 v59, s58, 58 +; SI-NEXT: v_writelane_b32 v58, s36, 30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s40, v52 +; SI-NEXT: v_writelane_b32 v59, s9, 59 +; SI-NEXT: v_writelane_b32 v58, s48, 31 +; SI-NEXT: v_writelane_b32 v59, s40, 60 +; SI-NEXT: v_writelane_b32 v58, s49, 32 +; SI-NEXT: v_writelane_b32 v59, vcc_lo, 61 +; SI-NEXT: v_writelane_b32 v58, s51, 33 +; SI-NEXT: v_writelane_b32 v59, vcc_hi, 62 +; SI-NEXT: v_writelane_b32 v58, s52, 34 +; SI-NEXT: v_writelane_b32 v59, s84, 63 +; SI-NEXT: v_writelane_b32 v58, s53, 35 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v43, 3 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 2 +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s17, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 56 -; SI-NEXT: v_readlane_b32 s4, v43, 1 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 0 +; SI-NEXT: v_writelane_b32 v58, s4, 36 +; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s60, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: 
s_mov_b32 s22, s6 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 58 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_writelane_b32 v58, s5, 37 +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 59 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 60 -; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: s_or_b32 s20, s6, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 61 -; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s23, s6, s5 +; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s6, s29, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_and_b32 s6, s31, 0xff +; SI-NEXT: v_readlane_b32 s16, v59, 48 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 62 -; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_or_b32 s28, s16, s6 +; SI-NEXT: s_and_b32 s6, s38, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s88, 24 -; SI-NEXT: s_mov_b32 s4, s47 -; SI-NEXT: s_or_b32 s47, s16, s6 -; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s16, s37, 24 +; SI-NEXT: s_or_b32 s57, s16, s6 +; SI-NEXT: s_and_b32 s6, s35, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s25, s16, s6 -; SI-NEXT: s_and_b32 s6, s93, 0xff -; SI-NEXT: s_lshl_b32 s16, s92, 8 +; SI-NEXT: s_lshl_b32 s16, s34, 24 +; SI-NEXT: s_or_b32 s73, s16, s6 +; SI-NEXT: s_and_b32 s6, s50, 0xff +; SI-NEXT: s_lshl_b32 s16, s39, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 -; SI-NEXT: s_and_b32 s16, s90, 0xff +; SI-NEXT: v_readlane_b32 s16, v59, 47 +; 
SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s17, v59, 46 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s92, s17, s16 -; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_or_b32 s74, s17, s16 +; SI-NEXT: v_readlane_b32 s16, v59, 45 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s17, v59, 44 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s76, s17, s16 -; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_or_b32 s56, s17, s16 +; SI-NEXT: v_readlane_b32 s16, v59, 43 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s17, v59, 42 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s91, s17, s16 -; SI-NEXT: s_and_b32 s16, s35, 0xff -; SI-NEXT: s_lshl_b32 s17, s34, 8 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_or_b32 s75, s17, s16 +; SI-NEXT: v_readlane_b32 s16, v59, 41 +; SI-NEXT: v_readlane_b32 s17, v59, 40 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s30, 0xff +; SI-NEXT: v_readlane_b32 s17, v59, 39 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v59, 38 ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s76, s18, s17 +; SI-NEXT: v_readlane_b32 s17, v59, 37 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v59, 36 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_or_b32 s77, s18, s17 -; SI-NEXT: s_and_b32 s17, s39, 0xff +; SI-NEXT: v_readlane_b32 s17, v59, 35 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v59, 34 ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_or_b32 s79, s18, s17 -; SI-NEXT: s_and_b32 s17, s36, 0xff +; 
SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s78, s18, s17 +; SI-NEXT: s_and_b32 s17, s81, 0xff +; SI-NEXT: s_lshl_b32 s18, s79, 8 +; SI-NEXT: s_or_b32 s18, s17, s18 +; SI-NEXT: v_readlane_b32 s17, v59, 33 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s19, v59, 32 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s19, s19, 24 +; SI-NEXT: s_mov_b32 s82, s81 +; SI-NEXT: s_mov_b32 s81, s79 +; SI-NEXT: s_or_b32 s79, s19, s17 +; SI-NEXT: s_and_b32 s17, s98, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s19, s97, 24 +; SI-NEXT: s_or_b32 s59, s19, s17 +; SI-NEXT: v_readlane_b32 s17, v59, 31 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s19, v59, 30 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s19, s19, 24 +; SI-NEXT: s_or_b32 s89, s19, s17 +; SI-NEXT: s_and_b32 s17, s40, 0xff +; SI-NEXT: s_lshl_b32 s19, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v59, 29 +; SI-NEXT: s_or_b32 s19, s17, s19 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 28 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: s_or_b32 s9, s21, s17 +; SI-NEXT: s_and_b32 s17, s85, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s84, 24 +; SI-NEXT: s_or_b32 s40, s21, s17 +; SI-NEXT: s_and_b32 s17, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, vcc_hi, 24 +; SI-NEXT: s_or_b32 s85, s21, s17 +; SI-NEXT: s_and_b32 s17, s54, 0xff +; SI-NEXT: s_lshl_b32 s21, s15, 8 +; SI-NEXT: s_or_b32 vcc_lo, s17, s21 +; SI-NEXT: s_and_b32 s17, s86, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s87, 24 +; SI-NEXT: s_or_b32 s15, s21, s17 +; SI-NEXT: s_and_b32 s17, s64, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s62, 24 +; SI-NEXT: s_or_b32 s99, s21, s17 +; SI-NEXT: s_and_b32 s17, s67, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s42, 24 +; SI-NEXT: s_or_b32 s22, s21, s17 +; 
SI-NEXT: s_and_b32 s17, s68, 0xff +; SI-NEXT: s_lshl_b32 s21, s65, 8 +; SI-NEXT: s_or_b32 vcc_hi, s17, s21 +; SI-NEXT: s_and_b32 s17, s61, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s55, 24 +; SI-NEXT: s_or_b32 s55, s21, s17 +; SI-NEXT: s_and_b32 s17, s69, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s93, s18, s17 +; SI-NEXT: s_lshl_b32 s21, s66, 24 +; SI-NEXT: s_or_b32 s65, s21, s17 +; SI-NEXT: s_and_b32 s17, s72, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s63, 24 +; SI-NEXT: s_or_b32 s66, s21, s17 +; SI-NEXT: s_and_b32 s17, s92, 0xff +; SI-NEXT: s_lshl_b32 s21, s90, 8 +; SI-NEXT: s_or_b32 s84, s17, s21 +; SI-NEXT: s_and_b32 s17, s71, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s70, 24 +; SI-NEXT: s_or_b32 s68, s21, s17 +; SI-NEXT: s_and_b32 s17, s93, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s83, 24 +; SI-NEXT: s_or_b32 s69, s21, s17 +; SI-NEXT: s_and_b32 s17, s88, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s80, 24 +; SI-NEXT: s_or_b32 s70, s21, s17 +; SI-NEXT: s_and_b32 s17, s48, 0xff +; SI-NEXT: s_lshl_b32 s21, s30, 8 +; SI-NEXT: s_mov_b32 s93, s85 +; SI-NEXT: s_or_b32 s85, s17, s21 +; SI-NEXT: s_and_b32 s17, s91, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s96, 24 +; SI-NEXT: s_or_b32 s30, s21, s17 ; SI-NEXT: s_and_b32 s17, s51, 0xff -; SI-NEXT: s_lshl_b32 s18, s50, 8 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s48, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s89, s19, s18 -; SI-NEXT: s_and_b32 s18, s55, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_or_b32 s31, s19, s18 -; SI-NEXT: s_and_b32 s18, s52, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s53, 24 -; SI-NEXT: s_or_b32 s94, s19, s18 -; SI-NEXT: s_and_b32 s18, s84, 
0xff -; SI-NEXT: s_lshl_b32 s19, s67, 8 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: s_and_b32 s19, s64, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s60, s20, s19 -; SI-NEXT: s_and_b32 s19, s12, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s8, 24 -; SI-NEXT: s_or_b32 s8, s20, s19 -; SI-NEXT: s_and_b32 s19, s85, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s86, 24 -; SI-NEXT: s_or_b32 s12, s20, s19 -; SI-NEXT: s_and_b32 s19, s80, 0xff -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: s_or_b32 vcc_lo, s19, s20 -; SI-NEXT: s_and_b32 s19, s44, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s97, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s15, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s82, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: s_or_b32 s23, s20, s19 -; SI-NEXT: s_and_b32 s19, s26, 0xff -; SI-NEXT: s_lshl_b32 s20, s81, 8 -; SI-NEXT: s_or_b32 vcc_hi, s19, s20 -; SI-NEXT: s_and_b32 s19, s99, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 50 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s98, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s96, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s46, 0xff -; SI-NEXT: s_lshl_b32 s20, s74, 8 -; SI-NEXT: s_or_b32 s84, s19, s20 -; SI-NEXT: s_and_b32 s19, s71, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: s_or_b32 s72, s20, s19 -; 
SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s68, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s14, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s69, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s58, 0xff -; SI-NEXT: s_lshl_b32 s20, s66, 8 -; SI-NEXT: s_or_b32 s85, s19, s20 -; SI-NEXT: s_and_b32 s19, s10, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s62, 24 -; SI-NEXT: s_or_b32 s49, s20, s19 -; SI-NEXT: s_and_b32 s19, s27, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 55 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 33 -; SI-NEXT: s_or_b32 s50, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 32 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 31 -; SI-NEXT: s_or_b32 s51, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 30 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 -; SI-NEXT: s_or_b32 s86, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 28 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 27 -; SI-NEXT: s_or_b32 s52, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 26 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 25 -; SI-NEXT: s_or_b32 s53, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 24 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 23 -; SI-NEXT: s_or_b32 s54, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 22 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, 
v43, 21 -; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 20 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 19 -; SI-NEXT: s_or_b32 s55, s20, s19 -; SI-NEXT: s_mov_b32 s58, s9 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 18 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: s_or_b32 s64, s20, s19 -; SI-NEXT: s_and_b32 s19, s78, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s24, 24 -; SI-NEXT: s_or_b32 s65, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff -; SI-NEXT: s_lshl_b32 s20, s45, 8 -; SI-NEXT: s_or_b32 s26, s19, s20 -; SI-NEXT: s_and_b32 s19, s13, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s57, 24 -; SI-NEXT: s_or_b32 s66, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 -; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s42, 0xff -; SI-NEXT: v_readlane_b32 s88, v43, 17 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s59, 24 -; SI-NEXT: s_or_b32 s68, s20, s19 -; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s88, 8 -; SI-NEXT: s_or_b32 s27, s19, s20 -; SI-NEXT: s_and_b32 s19, s40, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_and_b32 s19, s61, 0xff -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s57, s7 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 16 -; SI-NEXT: s_or_b32 s70, s20, s19 -; SI-NEXT: s_mov_b32 s10, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 15 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s71, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 14 -; SI-NEXT: s_or_b32 s62, s20, s19 -; SI-NEXT: s_mov_b32 s15, 
s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 -; SI-NEXT: s_mov_b32 s41, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 12 -; SI-NEXT: s_or_b32 s29, s19, s20 -; SI-NEXT: s_mov_b32 s14, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 11 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 10 -; SI-NEXT: s_or_b32 s80, s20, s19 -; SI-NEXT: s_mov_b32 s56, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 9 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s81, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 8 -; SI-NEXT: s_or_b32 s11, s20, s19 -; SI-NEXT: s_mov_b32 s82, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 7 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s96, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 6 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s63, s93 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s61, s91 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: s_mov_b32 s75, s92 -; SI-NEXT: s_or_b32 s92, s20, s19 -; SI-NEXT: s_mov_b32 s98, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 5 -; SI-NEXT: s_mov_b32 s44, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 4 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: s_mov_b32 s13, s94 -; SI-NEXT: s_mov_b32 s94, s21 -; SI-NEXT: s_or_b32 s21, s19, s20 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s49, 24 +; SI-NEXT: s_or_b32 s42, s21, s17 +; SI-NEXT: s_and_b32 s17, s95, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s94, 24 +; SI-NEXT: s_or_b32 s94, s21, s17 +; SI-NEXT: s_and_b32 s17, s53, 0xff +; SI-NEXT: s_lshl_b32 s21, s52, 8 +; SI-NEXT: s_or_b32 s86, s17, s21 +; 
SI-NEXT: s_and_b32 s17, s36, 0xff +; SI-NEXT: s_mov_b32 s91, s9 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s58, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 27 +; SI-NEXT: s_or_b32 s95, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 26 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 25 +; SI-NEXT: s_or_b32 s48, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 23 +; SI-NEXT: s_or_b32 s51, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 22 +; SI-NEXT: s_lshl_b32 s21, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v59, 21 +; SI-NEXT: s_or_b32 s87, s17, s21 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 20 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 19 +; SI-NEXT: s_or_b32 s52, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 18 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 17 +; SI-NEXT: s_mov_b32 s88, s79 +; SI-NEXT: s_or_b32 s79, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 15 +; SI-NEXT: s_or_b32 s53, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 14 +; SI-NEXT: s_lshl_b32 s21, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v59, 13 +; SI-NEXT: s_mov_b32 s72, s28 +; SI-NEXT: s_or_b32 s28, s17, s21 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s45, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 12 +; SI-NEXT: s_or_b32 s71, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 11 +; 
SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 10 +; SI-NEXT: s_or_b32 s80, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 9 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 8 +; SI-NEXT: s_or_b32 s83, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 7 +; SI-NEXT: s_lshl_b32 s21, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v59, 6 +; SI-NEXT: s_mov_b32 s61, s20 +; SI-NEXT: s_or_b32 s20, s17, s21 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s14, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 5 +; SI-NEXT: s_or_b32 s96, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 4 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 3 +; SI-NEXT: s_or_b32 s64, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 2 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v59, 1 +; SI-NEXT: s_or_b32 s49, s21, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v59, 0 +; SI-NEXT: s_lshl_b32 s21, s9, 8 +; SI-NEXT: s_or_b32 s9, s17, s21 +; SI-NEXT: s_and_b32 s17, s12, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s10, 24 +; SI-NEXT: s_or_b32 s62, s21, s17 +; SI-NEXT: s_and_b32 s17, s47, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s41, 24 +; SI-NEXT: s_or_b32 s54, s21, s17 +; SI-NEXT: s_and_b32 s17, s13, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s11, 24 +; SI-NEXT: s_or_b32 s46, s21, s17 +; SI-NEXT: s_and_b32 s17, s44, 0xff +; SI-NEXT: s_lshl_b32 s21, s43, 8 +; SI-NEXT: s_or_b32 s58, s17, s21 +; SI-NEXT: s_and_b32 s17, s8, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s7, 24 +; SI-NEXT: s_mov_b32 
s92, s40 +; SI-NEXT: s_or_b32 s40, s21, s17 +; SI-NEXT: s_mov_b32 s63, s23 +; SI-NEXT: s_mov_b32 s90, s89 +; SI-NEXT: s_lshl_b32 s23, s4, 16 +; SI-NEXT: s_lshl_b32 s21, s5, 16 +; SI-NEXT: s_lshl_b32 s67, s6, 16 +; SI-NEXT: s_lshl_b32 s17, s16, 16 +; SI-NEXT: s_lshl_b32 s16, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v42, 58 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s73, s12 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s59, s8 -; SI-NEXT: s_mov_b32 s30, s88 -; SI-NEXT: s_mov_b32 s88, s31 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s12, s7 -; SI-NEXT: s_mov_b32 s7, s22 -; SI-NEXT: s_or_b32 s83, s20, s19 -; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s74, s5, 16 -; SI-NEXT: s_lshl_b32 s22, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s19, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s17, vcc_lo, 16 +; SI-NEXT: s_lshl_b32 s18, vcc_lo, 16 ; SI-NEXT: s_lshl_b32 s6, vcc_hi, 16 -; SI-NEXT: s_lshl_b32 s99, s84, 16 -; SI-NEXT: s_lshl_b32 s8, s85, 16 -; SI-NEXT: s_lshl_b32 s97, s86, 16 -; SI-NEXT: s_lshl_b32 s28, s87, 16 -; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v42, 56 -; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v42, 57 -; SI-NEXT: v_readlane_b32 s35, v42, 61 -; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v42, 60 -; SI-NEXT: v_readlane_b32 s24, v42, 59 -; SI-NEXT: v_readlane_b32 s90, v42, 62 -; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: s_lshl_b32 s84, s84, 16 +; SI-NEXT: s_lshl_b32 s85, s85, 16 +; SI-NEXT: s_lshl_b32 s86, s86, 16 +; SI-NEXT: s_lshl_b32 s87, s87, 16 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_mov_b32 s36, s15 +; SI-NEXT: s_mov_b32 s15, s79 +; 
SI-NEXT: s_mov_b32 s79, s81 +; SI-NEXT: s_mov_b32 s81, s82 +; SI-NEXT: s_mov_b32 s82, s40 +; SI-NEXT: s_mov_b32 s40, s64 +; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s98, 3 +; SI-NEXT: s_add_i32 s4, s44, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_add_i32 s6, s12, 3 +; SI-NEXT: s_lshl_b32 s5, s43, 8 +; SI-NEXT: s_add_i32 s6, s8, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s5, s7, 24 @@ -149716,952 +150305,920 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_add_i32 s5, s47, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s6, s81, 8 -; SI-NEXT: s_add_i32 s16, s82, 3 +; SI-NEXT: s_lshl_b32 s6, s41, 8 +; SI-NEXT: s_add_i32 s16, s13, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s6, s96, 24 +; SI-NEXT: s_lshl_b32 s6, s11, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s15, 3 +; SI-NEXT: v_readlane_b32 s6, v59, 1 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v59, 0 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s16, s41, 8 -; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_lshl_b32 s16, s7, 8 +; SI-NEXT: s_add_i32 s17, s12, 3 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s16, s9, 24 +; SI-NEXT: s_lshl_b32 s16, s10, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: v_readlane_b32 s7, v59, 5 ; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: s_add_i32 s16, s93, 3 
+; SI-NEXT: s_add_i32 s16, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v59, 4 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s91, 8 -; SI-NEXT: s_add_i32 s18, s10, 3 +; SI-NEXT: s_lshl_b32 s17, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v59, 2 ; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_lshl_b32 s17, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v59, 3 +; SI-NEXT: s_add_i32 s18, s7, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s17, s71, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_readlane_b32 s7, v59, 8 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_add_i32 s17, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v59, 7 +; SI-NEXT: s_lshl_b32 s18, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v59, 6 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s30, 8 -; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_add_i32 s19, s7, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s18, s14, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: v_readlane_b32 s7, v59, 12 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_add_i32 s18, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v59, 11 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s34, 8 -; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_lshl_b32 s19, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v59, 9 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s38, 24 -; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s19, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v59, 10 +; SI-NEXT: s_add_i32 s21, s7, 3 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 -; SI-NEXT: 
s_or_b32 s19, s19, s20 +; SI-NEXT: s_or_b32 s19, s19, s21 ; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: v_readlane_b32 s7, v59, 15 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_add_i32 s19, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v59, 14 +; SI-NEXT: s_lshl_b32 s21, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v59, 13 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s46, 8 -; SI-NEXT: s_add_i32 s22, s48, 3 -; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_add_i32 s22, s7, 3 +; SI-NEXT: s_or_b32 s19, s21, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s39, 24 +; SI-NEXT: s_lshl_b32 s21, s45, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 -; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s58, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 18 -; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: v_readlane_b32 s7, v59, 19 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_add_i32 s21, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v59, 18 +; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_lshl_b32 s22, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 49 -; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: v_readlane_b32 s7, v59, 16 +; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_lshl_b32 s22, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 48 +; SI-NEXT: v_readlane_b32 s7, v59, 17 ; SI-NEXT: s_add_i32 s23, s7, 3 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s21, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 23 -; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: v_readlane_b32 s7, v59, 23 +; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_add_i32 s22, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: v_readlane_b32 s7, v59, 
22 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_lshl_b32 s23, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 20 +; SI-NEXT: v_readlane_b32 s7, v59, 20 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_lshl_b32 s23, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 21 +; SI-NEXT: v_readlane_b32 s7, v59, 21 ; SI-NEXT: s_add_i32 s60, s7, 3 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v59, 27 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v59, 26 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v59, 24 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v59, 25 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: s_and_b32 s61, s61, 0xff -; SI-NEXT: s_lshl_b32 s61, s61, 16 -; SI-NEXT: s_addk_i32 s23, 0x300 -; SI-NEXT: s_or_b32 s60, s60, s61 -; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 31 -; SI-NEXT: s_or_b32 s23, s60, s23 -; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 30 -; SI-NEXT: s_and_b32 s60, s60, 0xff -; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 28 -; SI-NEXT: s_or_b32 s60, s61, s60 -; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 29 -; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 47 -; SI-NEXT: s_and_b32 s62, s62, 0xff +; SI-NEXT: v_readlane_b32 s7, v58, 35 ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 46 -; SI-NEXT: s_lshl_b32 s62, s62, 16 -; SI-NEXT: s_addk_i32 s60, 0x300 +; SI-NEXT: v_readlane_b32 s7, v58, 34 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; 
SI-NEXT: v_readlane_b32 s7, v43, 32 -; SI-NEXT: s_or_b32 s61, s61, s62 -; SI-NEXT: s_and_b32 s60, s60, 0xffff +; SI-NEXT: v_readlane_b32 s7, v59, 58 ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 33 -; SI-NEXT: s_or_b32 s60, s61, s60 -; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 45 +; SI-NEXT: v_readlane_b32 s7, v58, 30 +; SI-NEXT: s_add_i32 s46, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v58, 33 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 44 +; SI-NEXT: v_readlane_b32 s7, v58, 32 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 43 -; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 42 -; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 41 -; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 38 -; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 35 -; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 34 -; SI-NEXT: s_and_b32 s45, s45, 0xff -; SI-NEXT: s_add_i32 s14, s7, 3 -; SI-NEXT: s_or_b32 s42, s42, s45 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v42, 40 +; SI-NEXT: v_readlane_b32 s7, v58, 27 +; SI-NEXT: s_lshl_b32 s43, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v58, 28 ; SI-NEXT: s_and_b32 s57, s57, 0xff -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 39 +; SI-NEXT: v_readlane_b32 s7, v58, 31 ; SI-NEXT: s_or_b32 s56, s56, s57 -; SI-NEXT: s_or_b32 s57, s14, s15 -; SI-NEXT: s_and_b32 s14, s44, 0xff +; SI-NEXT: s_and_b32 s44, s44, 0xff +; SI-NEXT: s_add_i32 s47, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v58, 29 +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_lshl_b32 s45, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v58, 21 +; SI-NEXT: s_or_b32 s43, s43, s44 
+; SI-NEXT: s_and_b32 s44, s56, 0xffff +; SI-NEXT: s_lshl_b32 s13, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v58, 23 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: s_and_b32 s44, s47, 0xff +; SI-NEXT: s_add_i32 s15, s7, 3 +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_addk_i32 s44, 0x300 +; SI-NEXT: v_readlane_b32 s7, v58, 26 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_and_b32 s15, s44, 0xffff +; SI-NEXT: s_add_i32 s42, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v58, 24 +; SI-NEXT: s_or_b32 s44, s13, s15 ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 37 -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 36 -; SI-NEXT: s_add_i32 s40, s7, 3 -; SI-NEXT: s_and_b32 s61, s61, 0xff -; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: s_lshl_b32 s61, s61, 16 -; SI-NEXT: s_addk_i32 s58, 0x300 -; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_or_b32 s59, s59, s61 -; SI-NEXT: s_and_b32 s58, s58, 0xffff -; SI-NEXT: s_or_b32 s15, s15, s40 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_or_b32 s58, s59, s58 -; SI-NEXT: s_or_b32 s59, s15, s14 -; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v42, 31 -; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 28 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: v_readlane_b32 s7, v58, 19 +; SI-NEXT: s_lshl_b32 s11, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v58, 20 +; SI-NEXT: s_and_b32 s13, s42, 0xff +; SI-NEXT: s_add_i32 s12, s7, 3 +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: v_readlane_b32 s7, v58, 25 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_add_i32 s41, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v58, 22 +; SI-NEXT: s_or_b32 s45, s11, s12 +; SI-NEXT: s_lshl_b32 s12, s7, 8 +; 
SI-NEXT: v_readlane_b32 s7, v58, 17 +; SI-NEXT: s_lshl_b32 s9, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v58, 18 +; SI-NEXT: s_and_b32 s11, s41, 0xff +; SI-NEXT: s_add_i32 s10, s7, 3 +; SI-NEXT: s_or_b32 s47, s12, s11 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_addk_i32 s47, 0x300 +; SI-NEXT: s_or_b32 s56, s9, s10 +; SI-NEXT: s_add_i32 s9, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s47, 0xffff +; SI-NEXT: s_add_i32 s42, s43, 0x3000000 +; SI-NEXT: s_add_i32 s43, s44, 0x3000000 +; SI-NEXT: s_add_i32 s44, s45, 0x3000000 +; SI-NEXT: s_or_b32 s45, s56, s6 +; SI-NEXT: v_readlane_b32 s6, v58, 16 +; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v58, 14 +; SI-NEXT: s_and_b32 s6, s8, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 25 +; SI-NEXT: v_readlane_b32 s7, v58, 9 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 24 -; SI-NEXT: s_add_i32 s24, s7, 3 -; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: v_readlane_b32 s7, v58, 10 +; SI-NEXT: s_add_i32 s10, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s7, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_or_b32 s8, s8, s16 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v42, 32 -; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 26 -; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 16 -; SI-NEXT: s_or_b32 s6, s11, s6 -; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 18 -; SI-NEXT: s_add_i32 s12, s7, 3 -; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v42, 33 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: 
s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 30 -; SI-NEXT: s_or_b32 s6, s11, s6 -; SI-NEXT: s_and_b32 s11, s13, 0xff -; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 22 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 23 -; SI-NEXT: s_add_i32 s25, s7, 3 -; SI-NEXT: s_and_b32 s12, s25, 0xff -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v42, 29 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s9, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 27 -; SI-NEXT: v_readlane_b32 s11, v42, 20 -; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: v_readlane_b32 s6, v58, 15 +; SI-NEXT: s_add_i32 s7, s6, 3 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v58, 13 +; SI-NEXT: v_readlane_b32 s16, v58, 8 ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v42, 21 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s9, s11 -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v42, 19 -; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v42, 17 -; SI-NEXT: v_readlane_b32 s12, v42, 14 -; SI-NEXT: s_and_b32 s9, s21, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 8 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 15 -; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_lshl_b32 s11, s11, 24 -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 13 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v42, 12 
-; SI-NEXT: v_readlane_b32 s13, v42, 10 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 11 -; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_lshl_b32 s12, s12, 24 -; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 9 -; SI-NEXT: s_add_i32 s15, s16, 0x3000000 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v42, 8 -; SI-NEXT: v_readlane_b32 s16, v42, 6 -; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s13, s13, 8 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 7 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_lshl_b32 s13, s13, 24 +; SI-NEXT: s_add_i32 s20, s16, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v58, 7 +; SI-NEXT: s_and_b32 s16, s20, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_or_b32 s13, s13, s16 -; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 5 -; SI-NEXT: s_add_i32 s40, s17, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v42, 4 -; SI-NEXT: v_readlane_b32 s17, v42, 2 -; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v58, 12 +; SI-NEXT: s_add_i32 s11, s17, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_readlane_b32 s16, v58, 11 +; SI-NEXT: v_readlane_b32 s17, v58, 5 +; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, 
v42, 3 +; SI-NEXT: s_or_b32 s7, s16, s7 +; SI-NEXT: v_readlane_b32 s16, v58, 4 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 1 -; SI-NEXT: s_add_i32 s41, s18, 0x3000000 +; SI-NEXT: s_or_b32 s7, s16, s7 +; SI-NEXT: v_readlane_b32 s16, v58, 6 +; SI-NEXT: s_add_i32 s12, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v58, 3 +; SI-NEXT: v_readlane_b32 s18, v58, 1 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v58, 2 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 61 -; SI-NEXT: s_add_i32 s42, s19, 0x3000000 +; SI-NEXT: s_or_b32 s18, s17, s16 +; SI-NEXT: v_readlane_b32 s16, v58, 0 +; SI-NEXT: s_add_i32 s13, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 60 -; SI-NEXT: v_readlane_b32 s19, v43, 58 +; SI-NEXT: v_readlane_b32 s17, v59, 63 +; SI-NEXT: v_readlane_b32 s19, v59, 61 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 59 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v59, 62 ; SI-NEXT: s_and_b32 s19, s19, 
0xff ; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 57 -; SI-NEXT: s_add_i32 s43, s20, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v43, 56 -; SI-NEXT: v_readlane_b32 s20, v43, 54 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 55 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_addk_i32 s18, 0x300 -; SI-NEXT: s_lshl_b32 s19, s19, 24 -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_or_b32 s19, s19, s20 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: s_or_b32 s17, s17, s19 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s20, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v59, 60 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v59, 59 +; SI-NEXT: v_readlane_b32 s19, v59, 29 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v43, 52 -; SI-NEXT: v_readlane_b32 s21, v43, 50 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v59, 28 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 8 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s19 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s14, s21, 0x3000000 +; SI-NEXT: s_add_i32 s19, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s98, 3 +; SI-NEXT: v_readlane_b32 s21, v59, 31 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s97, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 -; 
SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 51 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v59, 30 ; SI-NEXT: s_and_b32 s21, s21, 0xff -; SI-NEXT: s_addk_i32 s19, 0x300 -; SI-NEXT: s_lshl_b32 s20, s20, 24 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 49 -; SI-NEXT: s_add_i32 s44, s22, 0x3000000 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v43, 48 -; SI-NEXT: v_readlane_b32 s22, v43, 46 -; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s21 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_or_b32 s46, s59, s46 +; SI-NEXT: s_add_i32 s59, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s81, 3 +; SI-NEXT: v_readlane_b32 s21, v59, 33 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s79, 8 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v59, 32 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s21 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v59, 37 +; SI-NEXT: s_add_i32 s15, s22, 0x3000000 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_readlane_b32 s21, v59, 36 +; SI-NEXT: v_readlane_b32 s22, v59, 35 +; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v43, 47 +; SI-NEXT: s_or_b32 s17, s21, s17 +; SI-NEXT: v_readlane_b32 s21, v59, 34 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: 
s_addk_i32 s17, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_addk_i32 s58, 0x300 +; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s21, s21, s22 -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 43 -; SI-NEXT: s_add_i32 s45, s23, 0x3000000 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v43, 42 -; SI-NEXT: v_readlane_b32 s23, v43, 44 -; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_and_b32 s58, s58, 0xffff +; SI-NEXT: s_or_b32 s17, s21, s17 +; SI-NEXT: s_or_b32 s46, s46, s58 +; SI-NEXT: s_add_i32 s58, s17, 0x3000000 +; SI-NEXT: v_readlane_b32 s17, v59, 41 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_readlane_b32 s21, v59, 40 +; SI-NEXT: v_readlane_b32 s22, v59, 39 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s61, s61, 0xff +; SI-NEXT: s_or_b32 s17, s21, s17 +; SI-NEXT: v_readlane_b32 s21, v59, 38 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s61, s61, 16 +; SI-NEXT: s_addk_i32 s23, 0x300 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_lshl_b32 s21, s21, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_or_b32 s23, s60, s23 +; SI-NEXT: s_or_b32 s17, s21, s17 +; SI-NEXT: v_readlane_b32 s21, v59, 45 +; SI-NEXT: s_add_i32 s40, s23, 0x3000000 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_readlane_b32 s22, v59, 44 +; SI-NEXT: v_readlane_b32 s23, v59, 43 +; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v43, 45 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: v_readlane_b32 s22, v59, 42 ; SI-NEXT: s_and_b32 s23, s23, 0xff -; SI-NEXT: 
s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s21, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_and_b32 s21, s21, 0xffff ; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 41 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v43, 40 -; SI-NEXT: v_readlane_b32 s24, v43, 38 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s23, s23, 8 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v43, 39 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_add_i32 s56, s21, 0x3000000 +; SI-NEXT: s_add_i32 s21, s50, 3 +; SI-NEXT: v_readlane_b32 s23, v59, 47 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_lshl_b32 s22, s39, 8 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: v_readlane_b32 s22, v59, 46 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s22, s22, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_add_i32 s22, s21, 0x3000000 +; SI-NEXT: s_add_i32 s21, s38, 3 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_lshl_b32 s23, s37, 8 +; SI-NEXT: s_add_i32 s28, s35, 3 +; SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s23, s34, 24 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s23, s23, s28 +; SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: s_add_i32 s57, s21, 0x3000000 +; SI-NEXT: v_readlane_b32 s21, v59, 57 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_lshl_b32 s23, s29, 8 +; SI-NEXT: s_add_i32 s28, s31, 3 +; 
SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: v_readlane_b32 s23, v59, 48 +; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 ; SI-NEXT: s_lshl_b32 s23, s23, 24 -; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s23, s23, s24 -; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 37 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v43, 36 -; SI-NEXT: v_readlane_b32 s25, v43, 34 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 35 -; SI-NEXT: s_and_b32 s25, s25, 0xff -; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_lshl_b32 s24, s24, 24 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s23, s23, s28 +; SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: s_add_i32 s23, s24, 3 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s28, s25, 8 +; SI-NEXT: s_add_i32 s29, s26, 3 +; SI-NEXT: s_or_b32 s23, s28, s23 +; SI-NEXT: s_and_b32 s29, s29, 0xff +; SI-NEXT: s_addk_i32 s23, 0x300 +; SI-NEXT: s_lshl_b32 s28, s27, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_or_b32 s23, s28, s23 +; SI-NEXT: s_add_i32 s28, s23, 0x3000000 +; SI-NEXT: v_readlane_b32 s23, v59, 56 +; SI-NEXT: s_add_i32 s41, s46, 0x3000000 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_readlane_b32 s29, v59, 55 +; SI-NEXT: v_readlane_b32 s46, v59, 53 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_or_b32 s23, s29, s23 +; SI-NEXT: v_readlane_b32 s29, v59, 54 ; SI-NEXT: s_and_b32 s46, s46, 0xff -; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: 
v_readlane_b32 s24, v43, 3 +; SI-NEXT: s_addk_i32 s23, 0x300 +; SI-NEXT: s_lshl_b32 s29, s29, 24 ; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_addk_i32 s56, 0x300 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v43, 2 -; SI-NEXT: v_readlane_b32 s26, v43, 1 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_lshl_b32 s25, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s56, s46, s47 -; SI-NEXT: s_add_i32 s47, s58, 0x3000000 -; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v43, 0 -; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_addk_i32 s24, 0x300 -; SI-NEXT: s_lshl_b32 s25, s25, 24 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s17, 16 -; SI-NEXT: v_writelane_b32 v42, s9, 50 -; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_lshl_b32 s7, s10, 16 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 -; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_or_b32 s23, s29, s23 +; SI-NEXT: v_readlane_b32 s29, v59, 52 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_readlane_b32 s46, v59, 51 +; SI-NEXT: v_readlane_b32 s47, v59, 49 +; SI-NEXT: s_and_b32 s29, s29, 0xff +; SI-NEXT: s_lshl_b32 s46, s46, 8 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: 
v_readlane_b32 s46, v59, 50 +; SI-NEXT: s_and_b32 s47, s47, 0xff +; SI-NEXT: s_addk_i32 s29, 0x300 +; SI-NEXT: s_lshl_b32 s46, s46, 24 +; SI-NEXT: s_lshl_b32 s47, s47, 16 +; SI-NEXT: s_and_b32 s29, s29, 0xffff +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: s_add_i32 s29, s29, 0x3000000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 -; SI-NEXT: s_add_i32 s46, s60, 0x3000000 -; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 -; SI-NEXT: s_add_i32 s12, s12, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_add_i32 s16, s16, 0x3000000 +; SI-NEXT: s_add_i32 s45, s45, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 0x3000000 -; SI-NEXT: s_add_i32 s19, s19, 0x3000000 -; SI-NEXT: s_add_i32 s20, s20, 0x3000000 -; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_lshl_b32 s7, s8, 16 -; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s22, 16 -; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_add_i32 s16, s16, 0x3000000 +; SI-NEXT: s_add_i32 s17, s17, 0x3000000 +; SI-NEXT: s_add_i32 s21, s21, 0x3000000 +; SI-NEXT: s_add_i32 s23, s23, 0x3000000 +; SI-NEXT: s_lshl_b32 s24, s29, 16 +; SI-NEXT: s_and_b32 s60, s29, 0xffff0000 +; SI-NEXT: v_writelane_b32 v58, s24, 36 +; SI-NEXT: s_and_b32 s24, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s63, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s28, 16 +; SI-NEXT: s_and_b32 s72, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: 
s_and_b32 s75, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s18, 16 -; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s73, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s57, 16 +; SI-NEXT: s_and_b32 s74, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s22, 16 +; SI-NEXT: s_and_b32 s75, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s56, 16 +; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s78, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s58, 16 +; SI-NEXT: s_and_b32 s88, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s12, 16 -; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_and_b32 s90, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s59, 16 +; SI-NEXT: s_and_b32 s91, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s93, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s20, 16 +; SI-NEXT: s_and_b32 s36, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s22, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s7, 16 +; SI-NEXT: s_and_b32 s55, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s57, 16 -; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s56, 16 -; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s47, 16 -; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s53, s45, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s44, 
16 -; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s43, 16 -; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s87, s42, 16 -; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s67, s41, 16 -; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s15, 16 -; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s5, 16 -; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 55 +; SI-NEXT: s_and_b32 s66, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s8, 16 +; SI-NEXT: s_and_b32 s68, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s84, s45, 16 +; SI-NEXT: s_and_b32 s70, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s44, 16 +; SI-NEXT: s_and_b32 s30, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s43, 16 +; SI-NEXT: s_and_b32 s94, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_and_b32 s95, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s86, s41, 16 +; SI-NEXT: s_and_b32 s51, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s40, 16 +; SI-NEXT: s_and_b32 s52, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s87, s15, 16 +; SI-NEXT: s_and_b32 s53, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s14, 16 +; SI-NEXT: s_and_b32 s71, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s13, 16 +; SI-NEXT: s_and_b32 s83, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s12, 16 +; SI-NEXT: s_and_b32 s96, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s11, 16 +; SI-NEXT: s_and_b32 s49, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s10, 16 +; SI-NEXT: s_and_b32 s62, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s46, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s5, 16 +; SI-NEXT: s_and_b32 s82, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s4, 16 +; SI-NEXT: v_writelane_b32 v58, s24, 37 ; SI-NEXT: .LBB89_3: ; %end -; 
SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_readlane_b32 s4, v58, 36 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s4, v58, 37 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s23 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, 
s22 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s77 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 50 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_readlane_b32 s4, v42, 51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; 
SI-NEXT: v_readlane_b32 s4, v42, 54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s65 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 55 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s69 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 
v2, 1.0, s97 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s15 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s71 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s80 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s99, v41, 35 -; SI-NEXT: v_readlane_b32 s98, 
v41, 34 -; SI-NEXT: v_readlane_b32 s97, v41, 33 -; SI-NEXT: v_readlane_b32 s96, v41, 32 -; SI-NEXT: v_readlane_b32 s87, v41, 31 -; SI-NEXT: v_readlane_b32 s86, v41, 30 -; SI-NEXT: v_readlane_b32 s85, v41, 29 -; SI-NEXT: v_readlane_b32 s84, v41, 28 -; SI-NEXT: v_readlane_b32 s83, v41, 27 -; SI-NEXT: v_readlane_b32 s82, v41, 26 -; SI-NEXT: v_readlane_b32 s81, v41, 25 -; SI-NEXT: v_readlane_b32 s80, v41, 24 -; SI-NEXT: v_readlane_b32 s71, v41, 23 -; SI-NEXT: v_readlane_b32 s70, v41, 22 -; SI-NEXT: v_readlane_b32 s69, v41, 21 -; SI-NEXT: v_readlane_b32 s68, v41, 20 -; SI-NEXT: v_readlane_b32 s67, v41, 19 -; SI-NEXT: v_readlane_b32 s66, v41, 18 -; SI-NEXT: v_readlane_b32 s65, v41, 17 -; SI-NEXT: v_readlane_b32 s64, v41, 16 -; SI-NEXT: v_readlane_b32 s55, v41, 15 -; SI-NEXT: v_readlane_b32 s54, v41, 14 -; SI-NEXT: v_readlane_b32 s53, v41, 13 -; SI-NEXT: v_readlane_b32 s52, v41, 12 -; SI-NEXT: v_readlane_b32 s51, v41, 11 -; SI-NEXT: v_readlane_b32 s50, v41, 10 -; SI-NEXT: v_readlane_b32 s49, v41, 9 -; SI-NEXT: v_readlane_b32 s48, v41, 8 -; SI-NEXT: v_readlane_b32 s39, v41, 7 -; SI-NEXT: v_readlane_b32 s38, v41, 6 -; SI-NEXT: v_readlane_b32 s37, v41, 5 -; SI-NEXT: v_readlane_b32 s36, v41, 4 -; SI-NEXT: v_readlane_b32 s35, v41, 3 -; SI-NEXT: v_readlane_b32 s34, v41, 2 -; SI-NEXT: v_readlane_b32 s31, v41, 1 -; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, 
s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v57, 35 +; SI-NEXT: v_readlane_b32 s98, v57, 34 +; SI-NEXT: v_readlane_b32 s97, v57, 33 +; SI-NEXT: v_readlane_b32 s96, v57, 32 +; SI-NEXT: v_readlane_b32 s87, v57, 31 +; SI-NEXT: v_readlane_b32 s86, v57, 30 +; SI-NEXT: v_readlane_b32 s85, v57, 29 +; SI-NEXT: v_readlane_b32 s84, v57, 28 +; SI-NEXT: v_readlane_b32 s83, v57, 27 +; SI-NEXT: v_readlane_b32 s82, v57, 26 +; SI-NEXT: v_readlane_b32 s81, v57, 25 +; SI-NEXT: v_readlane_b32 s80, v57, 24 +; SI-NEXT: v_readlane_b32 s71, v57, 23 +; SI-NEXT: v_readlane_b32 s70, v57, 22 +; SI-NEXT: v_readlane_b32 s69, v57, 21 +; SI-NEXT: v_readlane_b32 s68, v57, 20 +; SI-NEXT: v_readlane_b32 s67, v57, 19 +; SI-NEXT: v_readlane_b32 s66, v57, 18 +; SI-NEXT: v_readlane_b32 s65, v57, 17 +; SI-NEXT: v_readlane_b32 s64, v57, 16 +; SI-NEXT: v_readlane_b32 s55, v57, 15 +; SI-NEXT: v_readlane_b32 s54, v57, 14 +; SI-NEXT: v_readlane_b32 s53, v57, 13 +; SI-NEXT: v_readlane_b32 s52, v57, 12 +; SI-NEXT: v_readlane_b32 s51, v57, 11 +; SI-NEXT: v_readlane_b32 s50, v57, 10 +; SI-NEXT: v_readlane_b32 s49, v57, 9 +; SI-NEXT: v_readlane_b32 s48, v57, 8 +; SI-NEXT: v_readlane_b32 s39, v57, 7 +; SI-NEXT: v_readlane_b32 s38, v57, 6 +; SI-NEXT: v_readlane_b32 s37, v57, 5 +; SI-NEXT: v_readlane_b32 s36, v57, 4 +; SI-NEXT: v_readlane_b32 s35, v57, 3 +; SI-NEXT: v_readlane_b32 s34, v57, 2 +; SI-NEXT: v_readlane_b32 s31, v57, 1 +; SI-NEXT: v_readlane_b32 s30, v57, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 
offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: s_mov_b32 s7, s6 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s58, v43, 19 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_mov_b32 s95, s47 -; SI-NEXT: s_mov_b32 s94, s21 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: v_readlane_b32 s56, v43, 10 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: v_readlane_b32 s30, v43, 17 -; SI-NEXT: v_readlane_b32 s98, v43, 6 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: v_readlane_b32 s15, v43, 14 -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: v_readlane_b32 s41, v43, 13 -; SI-NEXT: v_readlane_b32 s44, v43, 5 -; SI-NEXT: v_readlane_b32 s9, v43, 11 -; SI-NEXT: v_readlane_b32 s14, v43, 12 -; SI-NEXT: v_readlane_b32 s81, v43, 9 -; SI-NEXT: v_readlane_b32 s10, v43, 16 -; SI-NEXT: v_readlane_b32 s12, v43, 4 -; SI-NEXT: v_readlane_b32 s96, v43, 7 -; SI-NEXT: v_readlane_b32 s82, v43, 8 -; SI-NEXT: v_readlane_b32 s71, v43, 15 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; kill: 
killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; 
implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: @@ -152832,29 +153389,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:220 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:132 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:92 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v129, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:36 @@ -152886,16 +153443,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v162, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v148, 8, 
v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v176, 8, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v178, 8, v31 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 @@ -152951,7 +153508,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s5 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 @@ -152968,6 +153525,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff @@ -152976,115 +153536,139 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xff, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v133 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v66 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 
0xff, v51 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v52 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v1, 16, v0 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v162 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v148 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45 @@ -153092,40 +153676,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg 
%a ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74 @@ -153266,112 +153850,112 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v177 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v161 ; GFX11-TRUE16-NEXT: v_or_b32_e32 
v1, v75, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v147 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 
0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v150 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v149 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v145 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v178, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v162, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v148, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 ; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v133, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v131, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 @@ -153385,7 +153969,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v129, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 @@ -153394,7 +153978,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v135, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6 @@ -153492,32 +154076,32 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 
16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v145, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v118, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v128, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v134, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v161, 16, v32 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 @@ -153656,31 +154240,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 
offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:132 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 @@ -153712,16 +154296,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v18 
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 @@ -153777,7 +154361,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 @@ -153794,6 +154378,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff @@ -153802,115 +154389,139 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v133 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v66 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v7, 0xff, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v81 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 
v0, 0xff, v53 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v102 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v112 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v145 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 @@ -153918,40 +154529,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 @@ -153959,7 +154570,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 @@ -154087,117 +154698,117 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v179 ; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v177 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v161 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v147 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v132 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v150 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v149 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v145 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v178, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 
v1, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v130 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v128 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v148, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v133, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 @@ -154211,7 +154822,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v129, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 ; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 @@ -154220,7 +154831,7 @@ define inreg <64 x bfloat> 
@bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v135, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 @@ -154318,33 +154929,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v145, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v118, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v134, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v161, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 @@ -160785,988 +161396,995 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x15 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:48 -; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:60 +; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 ; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 ; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] -; 
GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v4 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v60, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v20 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v89, 16, v8 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v91, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v92, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v93, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v78, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v88, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v90, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, 
v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v79, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[17:18] ; GFX11-FAKE16-NEXT: .LBB90_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s2, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v68, 0x40c00000, v35 :: v_dual_lshlrev_b32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v67, 0x40c00000, v20 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v96, v68, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v68 +; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v67, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v33, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v36 :: v_dual_and_b32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v18, 16, 1 
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v48, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v37, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v77, v37, v39 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; GFX11-FAKE16-NEXT: v_perm_b32 v69, v77, v17, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v69 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v69 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v38, v18 :: v_dual_add_f32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: 
v_bfe_u32 v48, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v18, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v52, v18, v17, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v19, v19, v34, 0x7fff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v18, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_perm_b32 v68, v34, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v35, v37, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v19 :: v_dual_lshlrev_b32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v68 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v68 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v38, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v65, v19, v18, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v20, v39, v36, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v19 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v65 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v20, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v66, v18, v19, 
0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v34, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v64, v35, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v65 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v36, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v39, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v64 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v64 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[64:65] -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v22, v48, v37, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v35, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v71, v21, v20, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v21 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v22, v35 :: v_dual_add_f32 v22, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v21, 16, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v100, 0x40c00000, v34 :: v_dual_add_f32 v99, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v70, v36, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v23 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v37, v39 :: v_dual_lshlrev_b32 v39, 16, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v99 +; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v99, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v20, v37 :: v_dual_and_b32 v20, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v71, v35, v18, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 
v20, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v24, v49, v38, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v48, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v23, v36, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v24, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_perm_b32 v81, v23, v22, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v131, 0x40c00000, 
v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v24, 0x40c00000, v34 :: v_dual_cndmask_b32 v45, v21, v36 +; GFX11-FAKE16-NEXT: v_bfe_u32 v133, v131, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v131 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v24 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v80, v37, v36, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v26, v50, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v37, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_perm_b32 v39, v45, v20, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 16, v45 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v23, v21, v24, 0x7fff +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v24, 16, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v80 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v26, v37 :: v_dual_add_f32 v26, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v82, v38, v37, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v39, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v28, v51, v48, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v38, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v83, v25, v24, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v28, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, 
v49 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v49, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v22, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v23, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_lshlrev_b32 v23, 16, v25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_cndmask_b32 v181, v37, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v135, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v85, v181, v22, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v100 +; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v100, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v146, v135, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v25, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v135 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v135, v135 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v38, 16, v30 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v84, v39, v38, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v48, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v164, v34, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v181 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v30 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v30, v52, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v28, v28, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v27, 16, 1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v39, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v51, 0x7fff +; 
GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v28, v28, v51 :: v_dual_lshlrev_b32 v51, 16, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v83 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v30, v39 :: v_dual_add_f32 v30, 0x40c00000, v50 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v48, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v86, v48, v39, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v49, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v53, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v48, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v147, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 0x40c00000, v29 :: v_dual_add_f32 v34, 0x40c00000, v51 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v149, v49, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v129, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v57, v129, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v165, v147, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v147 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v30, v55 :: v_dual_lshlrev_b32 v55, 16, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_perm_b32 v85, v27, v26, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v32, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v96, v49, v48, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_perm_b32 v87, v149, v28, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v145, v51, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 ; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v87, v29, v28, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v55, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v149 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v65 :: v_dual_lshlrev_b32 v65, 16, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v31 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v32, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v128, 0x40c00000, v49 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v65 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v70, 0x40c00000, v37 :: v_dual_add_f32 v167, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v61, v55, v69, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v51, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v55, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v65 :: v_dual_lshlrev_b32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v178, 0x40c00000, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v98, v50, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v65 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v76, 
v55, v69, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v176, v167, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v166, 0x400000, v167 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v49 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff ; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v98 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v98 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v55, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v100, v3, v50, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v66, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v97, v31, v30, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v103, v3, v51, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v3 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_bfe_u32 v179, v178, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v55, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v180, 0x400000, v178 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v182, v51, v65 :: v_dual_lshlrev_b32 v51, 16, v8 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v49, 16, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_perm_b32 v102, v5, v53, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v99, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v51 :: v_dual_lshlrev_b32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v160, 0x40c00000, v49 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v91, v55, v69, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v37, v164, v24, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v34, v145, v30, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v8 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v67, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v66 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 
0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v151, v160, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v163, 0x400000, v160 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v150, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v134, v128, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v148, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v49, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v65 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v128 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_perm_b32 v182, v7, v54, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v183, v5, v6, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v112, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12 +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v162, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v161, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v89, v51, v69 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v183, 0x40c00000, v49 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v6, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_perm_b32 v101, v4, v49, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v177, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v55, 0x7fff ; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v40, v183, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v177, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v42, v49, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v65, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v177, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v41, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v65 :: v_dual_lshlrev_b32 v65, 16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v63, v49, v51, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v176, v9, v55, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v66, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v67, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v41, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v55, v177, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v6, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v113, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v177, v7, v8, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v1, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v112, v11, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v66, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_perm_b32 v162, v11, v9, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v65 +; GFX11-FAKE16-NEXT: v_bfe_u32 v130, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v117, v31, 16, 1 +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v58, v49, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v177, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v49, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v55, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v14 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: v_add3_u32 v112, v116, v13, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v149, v14, v52, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_perm_b32 v163, v12, v10, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v67 -; 
GFX11-FAKE16-NEXT: v_bfe_u32 v115, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v65, v16, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v43, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v41, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v64, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v103, v70, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v65, v177, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v70 +; GFX11-FAKE16-NEXT: v_perm_b32 v81, v91, v182, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v177, v49, v43, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v44, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v50, v48, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v43, 0x400000, v183 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_cndmask_b32 v54, v16, v54 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: v_perm_b32 v49, v177, v14, 0x7060302 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v94, v12, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v96, v68, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_lshlrev_b32 v16, 16, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v96, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v12, v82, 
vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v80, v67, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v67, 0x40c00000, v16 :: v_dual_lshlrev_b32 v16, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v95, v12, v98, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v116, v100, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v112, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v112 -; GFX11-FAKE16-NEXT: v_perm_b32 v148, v13, v66, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v114, v115, v16, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v112, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v113, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] -; GFX11-FAKE16-NEXT: v_perm_b32 v135, v16, v67, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v80, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v77, v12, v114 :: v_dual_and_b32 v12, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: v_add3_u32 v13, v112, v99, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 +; GFX11-FAKE16-NEXT: v_perm_b32 v51, v41, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v114, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v102 :: v_dual_add_f32 v102, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v133, v131, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v131, v131 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v176, v167, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v131, v114, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v129, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s1, v114, v114 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v74, v12, v118, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v146, v135, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v26, v179, v178, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v133, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v73, v12, v144, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v165, v147, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v147, v147 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v102, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v102 +; GFX11-FAKE16-NEXT: v_bfe_u32 v46, v44, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v60, v12, v32, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v167, v167 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v79, v6, v166, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v178, v178 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v47, 0x400000, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v92, v26, v180, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v26, v40, v183, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v183, v183 +; GFX11-FAKE16-NEXT: v_perm_b32 v83, v76, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v97, v61, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v69, v89, v8, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v65, v63, v42, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v180, v26, v43, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v26, v53, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v131, v114, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v50, v50 +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v98, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v55, v58, v10, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v49 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v165, v53, v118, s1 +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v129, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v167, v26, v80, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v67, v67 +; GFX11-FAKE16-NEXT: v_perm_b32 v67, v95, v48, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, v53, v133, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v178, v50, v100, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v102, v102 +; GFX11-FAKE16-NEXT: v_perm_b32 v53, v94, v54, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v48, v15, v165, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v15, v99, v102, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v54 +; 
GFX11-FAKE16-NEXT: v_add3_u32 v54, v82, v11, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, v15, v112, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[48:49] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX11-FAKE16-NEXT: v_perm_b32 v134, v15, v112, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v112 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v53 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[182:183] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v67 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v66 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v54 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101] +; GFX11-FAKE16-NEXT: v_perm_b32 v50, v15, v178, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, v54, v116, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v15, v46, v44, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v44, v44 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v42 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v182 +; GFX11-FAKE16-NEXT: v_perm_b32 v54, v11, v167, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v11, v96, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v183, v15, v47, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v81 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v81 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, v11, v68, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v11, v64, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[50:51] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v64, v9, v183, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v151, v160, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v7, v11, v33, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v160, v160 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v167 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v39 +; GFX11-FAKE16-NEXT: v_perm_b32 v68, v7, v180, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v43, v9, v163, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v162, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[66:67] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v55 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v131, 8, v55 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, v7, v161, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v150, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v54 +; GFX11-FAKE16-NEXT: v_perm_b32 v80, v5, v43, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v134, v128, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v148, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v128, v128 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v80 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v80 +; GFX11-FAKE16-NEXT: v_perm_b32 v82, v3, v92, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v46, v5, v132, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v130, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v180 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[80:81] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v82 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v3, v119, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v117, v31, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v31, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v82 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[82:83] +; GFX11-FAKE16-NEXT: v_perm_b32 v96, v1, v46, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 24, v65 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v3, v115, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v113, v29, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v29, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v64 +; GFX11-FAKE16-NEXT: v_perm_b32 v33, v1, v79, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v1, v103, v70, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v3, v101, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v70, v70 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 24, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v93, v1, v86, s0 +; GFX11-FAKE16-NEXT: v_add3_u32 v1, v84, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v27, v27 +; GFX11-FAKE16-NEXT: v_perm_b32 v86, v3, v60, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v36, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v70, v13, v77, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, v38, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v23, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v93 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v178 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v39 +; GFX11-FAKE16-NEXT: v_perm_b32 v36, v1, v93, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v3, v25, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v57, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v93, 16, v61 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] +; GFX11-FAKE16-NEXT: v_perm_b32 v84, v1, v73, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v38, v3, v74, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[84:85] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[70:71] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v134 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v134 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v149 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v149 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v148 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v148 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v97 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v96 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v84 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v81 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v77 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 24, v69 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v69 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v68 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v68 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v60 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v73 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[54:55] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[64:65] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[68:69] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[96:97] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 24, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v78, 8, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v88, 16, v96 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v90, 8, v96 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v117, 24, v87 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 8, v87 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v86 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v86 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v165 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v43 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v92 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v79 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v74 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v77 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v77, 8, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v79, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v177 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v41 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v58 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v89, 16, v89 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v91, 16, v91 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v92, 16, v76 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v145 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v164 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 16, v95 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v94 ; GFX11-FAKE16-NEXT: .LBB90_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v76 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v55, 8, v90 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v82 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v75 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v88 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v78 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v166 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v62 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v160 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v69, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v42 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v65, v53 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v54, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v69, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v69, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v69, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v89 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, 
v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v68, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v147 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v161 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v118 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v149 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v68, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v69 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v70, v71 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v118 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v116 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v41 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v112 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v102 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v100 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v51 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 @@ -161788,30 +162406,30 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, 
v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v79 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v73 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v146 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v76 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v47 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v61 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v43 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 
v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 @@ -161833,29 +162451,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v180 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v167 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v162 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v150 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v148 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v146 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 @@ -161878,31 +162496,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v115 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v164 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v128 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v119 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v113 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 
v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 @@ -161930,29 +162548,39 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:84 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:96 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:120 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:136 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -175322,13 +175950,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s10, s16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v61, s29, 0 ; SI-NEXT: v_writelane_b32 v61, s28, 1 ; SI-NEXT: v_writelane_b32 v61, s27, 2 -; SI-NEXT: s_mov_b32 s61, s21 -; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v61, s26, 3 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 @@ -175352,6 +175980,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s69, 21 ; SI-NEXT: v_writelane_b32 v63, s70, 22 ; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: s_mov_b32 s60, s16 +; SI-NEXT: s_mov_b32 s61, s23 ; SI-NEXT: v_writelane_b32 v63, s80, 24 ; SI-NEXT: v_writelane_b32 v63, s81, 25 ; SI-NEXT: v_writelane_b32 v63, s82, 26 @@ -175362,59 +175992,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s87, 31 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: s_mov_b32 s67, s19 -; SI-NEXT: s_mov_b32 s54, s17 -; SI-NEXT: s_mov_b32 s35, s23 -; SI-NEXT: s_mov_b32 s39, s26 -; SI-NEXT: s_mov_b32 s62, s25 ; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: 
v_readfirstlane_b32 s99, v1 -; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: s_mov_b32 s35, s21 +; SI-NEXT: s_mov_b32 s26, s25 +; SI-NEXT: s_mov_b32 s93, s19 +; SI-NEXT: v_readfirstlane_b32 s30, v1 +; SI-NEXT: v_readfirstlane_b32 s56, v28 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s6, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v62, s74, 0 -; SI-NEXT: v_readfirstlane_b32 s12, v26 -; SI-NEXT: v_writelane_b32 v62, s6, 1 -; SI-NEXT: v_readfirstlane_b32 s14, v25 -; SI-NEXT: v_writelane_b32 v62, s12, 2 -; SI-NEXT: v_readfirstlane_b32 s46, v28 -; SI-NEXT: v_writelane_b32 v62, s14, 3 -; SI-NEXT: v_readfirstlane_b32 s56, v27 -; SI-NEXT: v_writelane_b32 v62, s46, 4 -; SI-NEXT: v_readfirstlane_b32 s57, v30 -; SI-NEXT: v_writelane_b32 v62, s56, 5 -; SI-NEXT: v_readfirstlane_b32 s59, v29 -; SI-NEXT: v_writelane_b32 v62, s57, 6 -; SI-NEXT: v_writelane_b32 v62, s59, 7 -; SI-NEXT: s_mov_b32 s60, s20 -; SI-NEXT: s_mov_b32 s63, s24 -; SI-NEXT: v_readfirstlane_b32 s95, v3 -; SI-NEXT: v_readfirstlane_b32 s31, v5 -; SI-NEXT: v_readfirstlane_b32 s24, v9 -; SI-NEXT: v_readfirstlane_b32 s38, v12 -; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_readfirstlane_b32 s27, v13 -; SI-NEXT: v_readfirstlane_b32 s9, v16 -; SI-NEXT: v_readfirstlane_b32 s79, v15 -; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s58, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v62, s56, 0 +; SI-NEXT: v_readfirstlane_b32 s59, v30 +; SI-NEXT: v_writelane_b32 v62, s58, 1 +; SI-NEXT: v_readfirstlane_b32 s27, v29 +; SI-NEXT: v_writelane_b32 v62, s59, 2 +; SI-NEXT: v_writelane_b32 v62, s27, 3 +; SI-NEXT: s_mov_b32 s63, s18 +; SI-NEXT: s_mov_b32 s62, s22 +; SI-NEXT: v_readfirstlane_b32 s37, v3 +; SI-NEXT: v_readfirstlane_b32 s65, v5 +; SI-NEXT: v_readfirstlane_b32 s95, v9 +; SI-NEXT: v_readfirstlane_b32 s89, v11 +; SI-NEXT: v_readfirstlane_b32 s34, v13 +; SI-NEXT: 
v_readfirstlane_b32 s28, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v18 ; SI-NEXT: v_readfirstlane_b32 s15, v17 -; SI-NEXT: v_readfirstlane_b32 s42, v20 -; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_readfirstlane_b32 s42, v19 ; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: v_readfirstlane_b32 s47, v21 +; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: v_readfirstlane_b32 s82, v23 +; SI-NEXT: v_readfirstlane_b32 s84, v26 +; SI-NEXT: v_readfirstlane_b32 s7, v25 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s22, v12 +; SI-NEXT: v_readfirstlane_b32 s18, v10 +; SI-NEXT: v_readfirstlane_b32 s79, v8 +; SI-NEXT: v_readfirstlane_b32 s92, v7 +; SI-NEXT: v_readfirstlane_b32 s87, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: v_writelane_b32 v61, s4, 3 -; SI-NEXT: v_readfirstlane_b32 s45, v21 -; SI-NEXT: v_readfirstlane_b32 s98, v10 -; SI-NEXT: v_readfirstlane_b32 s90, v8 -; SI-NEXT: v_readfirstlane_b32 s88, v7 -; SI-NEXT: v_readfirstlane_b32 s91, v6 -; SI-NEXT: v_readfirstlane_b32 s93, v4 -; SI-NEXT: v_readfirstlane_b32 s55, v2 +; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_readfirstlane_b32 s90, v4 +; SI-NEXT: v_readfirstlane_b32 s97, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill @@ -175432,372 +176056,373 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 -; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s4, 5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 -; SI-NEXT: v_writelane_b32 v61, s4, 5 +; SI-NEXT: v_writelane_b32 v61, s4, 6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; SI-NEXT: v_writelane_b32 v61, s4, 7 +; SI-NEXT: v_writelane_b32 v61, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 -; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 -; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: v_writelane_b32 v61, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 -; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: v_writelane_b32 v61, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 -; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: v_writelane_b32 v61, s4, 13 +; SI-NEXT: v_writelane_b32 v61, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:284 -; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 -; SI-NEXT: v_writelane_b32 v61, s4, 15 +; SI-NEXT: v_writelane_b32 v61, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v61, s4, 17 +; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 -; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 -; SI-NEXT: v_writelane_b32 v61, s4, 19 +; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 -; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: v_writelane_b32 v61, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v61, s4, 21 +; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_writelane_b32 v61, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: v_writelane_b32 v61, s4, 23 +; 
SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: v_writelane_b32 v61, s4, 25 +; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: v_writelane_b32 v61, s4, 27 +; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: v_writelane_b32 v61, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: v_writelane_b32 v61, s4, 29 +; SI-NEXT: v_writelane_b32 v61, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: v_writelane_b32 v61, s4, 31 +; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s94, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 
s16, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 +; SI-NEXT: v_writelane_b32 v61, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s91, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v61, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 +; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s51, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 -; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: v_readfirstlane_b32 s11, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s55, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 -; SI-NEXT: v_writelane_b32 v61, s4, 35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s40, v31 +; SI-NEXT: v_readfirstlane_b32 s69, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 +; SI-NEXT: v_writelane_b32 v61, s4, 35 +; SI-NEXT: v_writelane_b32 v61, s17, 36 +; SI-NEXT: v_writelane_b32 v61, s60, 37 +; SI-NEXT: v_writelane_b32 v61, s93, 38 +; SI-NEXT: v_writelane_b32 v61, s63, 39 +; SI-NEXT: v_writelane_b32 v61, s35, 40 +; SI-NEXT: v_writelane_b32 v61, s20, 41 +; SI-NEXT: 
v_writelane_b32 v61, s61, 42 +; SI-NEXT: v_writelane_b32 v61, s62, 43 +; SI-NEXT: v_writelane_b32 v61, s26, 44 +; SI-NEXT: v_writelane_b32 v61, s24, 45 +; SI-NEXT: v_writelane_b32 v61, s30, 46 +; SI-NEXT: v_writelane_b32 v61, s37, 47 +; SI-NEXT: v_writelane_b32 v61, s65, 48 +; SI-NEXT: v_writelane_b32 v61, s95, 49 +; SI-NEXT: v_writelane_b32 v61, s89, 50 +; SI-NEXT: v_writelane_b32 v61, s34, 51 +; SI-NEXT: v_writelane_b32 v61, s28, 52 +; SI-NEXT: v_writelane_b32 v61, s8, 53 +; SI-NEXT: v_writelane_b32 v61, s12, 54 +; SI-NEXT: v_writelane_b32 v61, s15, 55 +; SI-NEXT: v_writelane_b32 v61, s41, 56 +; SI-NEXT: v_writelane_b32 v61, s42, 57 +; SI-NEXT: v_writelane_b32 v61, s44, 58 +; SI-NEXT: v_writelane_b32 v61, s47, 59 +; SI-NEXT: v_writelane_b32 v61, s74, 60 +; SI-NEXT: v_writelane_b32 v61, s82, 61 +; SI-NEXT: v_writelane_b32 v61, s84, 62 +; SI-NEXT: v_writelane_b32 v61, s7, 63 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: v_readfirstlane_b32 s71, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: v_readfirstlane_b32 s9, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: v_readfirstlane_b32 s13, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: v_readfirstlane_b32 s14, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: v_readfirstlane_b32 s40, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s41, v31 +; SI-NEXT: v_readfirstlane_b32 s43, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) 
-; SI-NEXT: v_readfirstlane_b32 s47, v31 +; SI-NEXT: v_readfirstlane_b32 s46, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: v_readfirstlane_b32 s57, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: v_readfirstlane_b32 s16, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: v_readfirstlane_b32 s23, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s76, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: v_writelane_b32 v61, s4, 36 -; SI-NEXT: v_writelane_b32 v61, s54, 37 -; SI-NEXT: v_writelane_b32 v61, s10, 38 -; SI-NEXT: v_writelane_b32 v61, s67, 39 -; SI-NEXT: v_writelane_b32 v61, s18, 40 -; SI-NEXT: v_writelane_b32 v61, s61, 41 -; SI-NEXT: v_writelane_b32 v61, s60, 42 -; SI-NEXT: v_writelane_b32 v61, s35, 43 -; SI-NEXT: v_writelane_b32 v61, s22, 44 -; SI-NEXT: v_writelane_b32 v61, s62, 45 -; SI-NEXT: v_writelane_b32 v61, s63, 46 -; SI-NEXT: v_writelane_b32 v61, s39, 47 -; SI-NEXT: v_writelane_b32 v61, s99, 48 -; SI-NEXT: v_writelane_b32 v61, s95, 49 -; SI-NEXT: v_writelane_b32 v61, s31, 50 -; SI-NEXT: v_writelane_b32 v61, s24, 51 -; SI-NEXT: v_writelane_b32 v61, s38, 52 -; SI-NEXT: v_writelane_b32 v61, s36, 53 -; SI-NEXT: v_writelane_b32 v61, s8, 54 -; SI-NEXT: v_writelane_b32 v61, s27, 55 -; SI-NEXT: v_writelane_b32 v61, s9, 56 -; SI-NEXT: v_writelane_b32 v61, s79, 57 -; SI-NEXT: v_writelane_b32 v61, s13, 58 -; SI-NEXT: v_writelane_b32 v61, s15, 59 -; SI-NEXT: v_writelane_b32 v61, s42, 60 -; SI-NEXT: v_writelane_b32 v61, s43, 61 -; SI-NEXT: v_writelane_b32 v61, s44, 62 -; SI-NEXT: v_writelane_b32 v61, s45, 63 -; SI-NEXT: 
s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s37, v31 +; SI-NEXT: v_readfirstlane_b32 s77, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s50, v31 +; SI-NEXT: v_readfirstlane_b32 s38, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s48, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s19, v31 +; SI-NEXT: v_readfirstlane_b32 s64, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s64, v31 +; SI-NEXT: v_readfirstlane_b32 s49, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s17, v31 +; SI-NEXT: v_readfirstlane_b32 s80, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s65, v31 +; SI-NEXT: v_readfirstlane_b32 s70, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: v_readfirstlane_b32 s68, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s70, v31 +; SI-NEXT: v_readfirstlane_b32 s67, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s83, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s49, v31 +; SI-NEXT: v_readfirstlane_b32 s39, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s80, v31 +; SI-NEXT: v_readfirstlane_b32 s85, v31 ; SI-NEXT: buffer_load_dword 
v31, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: v_readfirstlane_b32 s81, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: v_readfirstlane_b32 s53, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v31 +; SI-NEXT: v_readfirstlane_b32 s86, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: v_readfirstlane_b32 s98, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s50, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: v_readfirstlane_b32 s78, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: v_readfirstlane_b32 s99, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: v_readfirstlane_b32 s72, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s34, v31 +; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_readfirstlane_b32 s73, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: v_readfirstlane_b32 s96, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 
s78, v31 +; SI-NEXT: v_readfirstlane_b32 s75, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: v_readfirstlane_b32 s31, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: v_readfirstlane_b32 s36, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: v_readfirstlane_b32 s54, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s52, v31 +; SI-NEXT: v_readfirstlane_b32 s66, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: v_readfirstlane_b32 s52, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s25, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: v_readfirstlane_b32 s19, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: v_readfirstlane_b32 s29, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v62, s25, 8 -; SI-NEXT: v_writelane_b32 v62, s28, 9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s92, v31 -; SI-NEXT: v_writelane_b32 v62, s92, 10 -; SI-NEXT: v_writelane_b32 v62, s75, 11 -; SI-NEXT: v_writelane_b32 v62, s26, 12 -; SI-NEXT: v_writelane_b32 v62, s30, 13 -; SI-NEXT: v_writelane_b32 
v62, s23, 14 -; SI-NEXT: v_writelane_b32 v62, s52, 15 -; SI-NEXT: v_writelane_b32 v62, s64, 16 -; SI-NEXT: v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s65, 18 -; SI-NEXT: v_writelane_b32 v62, s70, 19 -; SI-NEXT: v_writelane_b32 v62, s71, 20 -; SI-NEXT: v_writelane_b32 v62, s49, 21 -; SI-NEXT: v_writelane_b32 v62, s83, 22 -; SI-NEXT: v_writelane_b32 v62, s80, 23 -; SI-NEXT: v_writelane_b32 v62, s82, 24 -; SI-NEXT: v_writelane_b32 v62, s84, 25 -; SI-NEXT: v_writelane_b32 v62, s87, 26 -; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s51, 28 -; SI-NEXT: v_writelane_b32 v62, s96, 29 -; SI-NEXT: v_writelane_b32 v62, s34, 30 -; SI-NEXT: v_writelane_b32 v62, s94, 31 -; SI-NEXT: v_writelane_b32 v62, s53, 32 -; SI-NEXT: v_writelane_b32 v62, s66, 33 -; SI-NEXT: v_writelane_b32 v62, s68, 34 -; SI-NEXT: v_writelane_b32 v62, s69, 35 -; SI-NEXT: v_writelane_b32 v62, s77, 36 -; SI-NEXT: v_writelane_b32 v62, s78, 37 +; SI-NEXT: v_writelane_b32 v62, s29, 4 +; SI-NEXT: v_writelane_b32 v62, s25, 5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s88, v31 +; SI-NEXT: v_writelane_b32 v62, s88, 6 +; SI-NEXT: v_writelane_b32 v62, s21, 7 +; SI-NEXT: v_writelane_b32 v62, s19, 8 +; SI-NEXT: v_writelane_b32 v62, s54, 9 +; SI-NEXT: v_writelane_b32 v62, s49, 10 +; SI-NEXT: v_writelane_b32 v62, s52, 11 +; SI-NEXT: v_writelane_b32 v62, s31, 12 +; SI-NEXT: v_writelane_b32 v62, s64, 13 +; SI-NEXT: v_writelane_b32 v62, s66, 14 +; SI-NEXT: v_writelane_b32 v62, s36, 15 +; SI-NEXT: v_writelane_b32 v62, s80, 16 +; SI-NEXT: v_writelane_b32 v62, s70, 17 +; SI-NEXT: v_writelane_b32 v62, s67, 18 +; SI-NEXT: v_writelane_b32 v62, s68, 19 +; SI-NEXT: v_writelane_b32 v62, s39, 20 +; SI-NEXT: v_writelane_b32 v62, s83, 21 +; SI-NEXT: v_writelane_b32 v62, s81, 22 +; SI-NEXT: v_writelane_b32 v62, s85, 23 +; SI-NEXT: v_writelane_b32 v62, s86, 24 +; SI-NEXT: v_writelane_b32 v62, s53, 25 +; SI-NEXT: v_writelane_b32 v62, s50, 26 +; SI-NEXT: v_writelane_b32 v62, 
s99, 27 +; SI-NEXT: v_writelane_b32 v62, s98, 28 +; SI-NEXT: v_writelane_b32 v62, s6, 29 +; SI-NEXT: v_writelane_b32 v62, s96, 30 +; SI-NEXT: v_writelane_b32 v62, s78, 31 +; SI-NEXT: v_writelane_b32 v62, s72, 32 +; SI-NEXT: v_writelane_b32 v62, s73, 33 +; SI-NEXT: v_writelane_b32 v62, s75, 34 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s35, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_and_b32 s4, s62, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s26, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 3 ; SI-NEXT: v_readlane_b32 s5, v61, 2 -; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill @@ -175809,621 +176434,615 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s99, 0xff -; SI-NEXT: s_lshl_b32 s5, s55, 8 +; SI-NEXT: s_and_b32 s4, s30, 0xff +; SI-NEXT: s_lshl_b32 s5, s97, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s90, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s31, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_and_b32 s4, s65, 0xff +; SI-NEXT: s_lshl_b32 s5, s87, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s98, 8 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s5, s18, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_and_b32 s4, s89, 0xff +; SI-NEXT: s_lshl_b32 s5, s22, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s79, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: s_lshl_b32 s5, s12, 8 ; 
SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_and_b32 s4, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_and_b32 s4, s82, 0xff ; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_and_b32 s4, s58, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s59, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_and_b32 s4, s52, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, 
s54, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s31, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s78, 0xff -; SI-NEXT: s_lshl_b32 s5, s66, 8 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_and_b32 s4, s72, 0xff +; SI-NEXT: s_lshl_b32 s5, s99, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_and_b32 s4, s78, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s98, 0xff +; SI-NEXT: s_lshl_b32 s5, s86, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s81, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_and_b32 s4, s80, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s39, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 ; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_lshl_b32 s5, s67, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: 
s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_and_b32 s4, s80, 0xff +; SI-NEXT: s_lshl_b32 s5, s49, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_and_b32 s4, s64, 0xff ; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_and_b32 s4, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s77, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s8, v61, 36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_and_b32 s4, s76, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s76, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s43, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s81, 8 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s9, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s96, v61, 35 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_and_b32 s4, s85, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, 
s5 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s69, v61, 35 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_and_b32 s4, s55, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s68, v61, 34 +; SI-NEXT: v_readlane_b32 s26, v61, 34 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_lshl_b32 s5, s26, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_readlane_b32 s6, v61, 33 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_and_b32 s4, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_and_b32 s4, s91, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: v_readlane_b32 s98, v61, 32 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_and_b32 s4, s53, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s98, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_readlane_b32 s99, v61, 31 +; SI-NEXT: v_readlane_b32 s50, v61, 30 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_and_b32 s4, s34, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_and_b32 s4, s99, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: v_readlane_b32 s86, v61, 27 +; SI-NEXT: v_readlane_b32 s53, v61, 29 +; SI-NEXT: v_readlane_b32 s86, v61, 28 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_and_b32 s4, s53, 0xff ; 
SI-NEXT: s_lshl_b32 s5, s86, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s84, v61, 25 +; SI-NEXT: v_readlane_b32 s85, v61, 27 +; SI-NEXT: v_readlane_b32 s81, v61, 26 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s81, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s80, v61, 23 +; SI-NEXT: v_readlane_b32 s83, v61, 25 +; SI-NEXT: v_readlane_b32 s39, v61, 24 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_and_b32 s4, s82, 0xff -; SI-NEXT: s_lshl_b32 s5, s80, 8 +; SI-NEXT: s_and_b32 s4, s83, 0xff +; SI-NEXT: s_lshl_b32 s5, s39, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s49, v61, 21 +; SI-NEXT: v_readlane_b32 s68, v61, 23 +; SI-NEXT: v_readlane_b32 s67, v61, 22 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s67, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s70, v61, 19 +; SI-NEXT: v_readlane_b32 s70, v61, 21 +; SI-NEXT: v_readlane_b32 s80, v61, 20 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_and_b32 s4, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_and_b32 s4, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s80, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s54, v61, 17 +; SI-NEXT: v_readlane_b32 s36, v61, 19 +; SI-NEXT: v_readlane_b32 s66, v61, 18 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_and_b32 s4, s65, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s66, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s67, 
v61, 16 -; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_readlane_b32 s64, v61, 17 +; SI-NEXT: v_readlane_b32 s31, v61, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_and_b32 s4, s67, 0xff -; SI-NEXT: s_lshl_b32 s5, s50, 8 +; SI-NEXT: s_and_b32 s4, s64, 0xff +; SI-NEXT: s_lshl_b32 s5, s31, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_readlane_b32 s52, v61, 15 +; SI-NEXT: v_readlane_b32 s49, v61, 14 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_and_b32 s4, s64, 0xff -; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: s_mov_b32 s23, s48 +; SI-NEXT: s_and_b32 s4, s52, 0xff +; SI-NEXT: s_lshl_b32 s5, s49, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_readlane_b32 s54, v61, 13 +; SI-NEXT: v_readlane_b32 s93, v61, 12 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_and_b32 s4, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_and_b32 s4, s54, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_mov_b32 s19, s38 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: v_readlane_b32 s38, v61, 11 +; SI-NEXT: v_readlane_b32 s35, v61, 10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s30, 0xff -; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_mov_b32 s26, s37 +; SI-NEXT: s_and_b32 s4, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_mov_b32 s21, s48 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_readlane_b32 s48, v61, 9 +; SI-NEXT: v_readlane_b32 s88, v61, 8 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: s_and_b32 s4, s37, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_and_b32 s4, s48, 0xff +; SI-NEXT: s_lshl_b32 
s5, s88, 8 +; SI-NEXT: s_mov_b32 s25, s77 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_readlane_b32 s77, v61, 6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_and_b32 s4, s75, 0xff ; SI-NEXT: s_lshl_b32 s5, s77, 8 -; SI-NEXT: s_mov_b32 s28, s29 ; SI-NEXT: s_mov_b32 s29, s76 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_readlane_b32 s78, v61, 5 +; SI-NEXT: v_readlane_b32 s76, v61, 4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s78, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_mov_b32 s99, s55 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s59, s58 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 +; SI-NEXT: s_mov_b32 s30, s97 +; SI-NEXT: s_mov_b32 s27, s23 +; SI-NEXT: s_mov_b32 s23, s16 +; SI-NEXT: s_mov_b32 s58, s57 +; SI-NEXT: s_mov_b32 s56, s46 +; SI-NEXT: s_mov_b32 s46, s43 +; SI-NEXT: s_mov_b32 s43, s40 +; SI-NEXT: s_mov_b32 s40, s14 +; SI-NEXT: s_mov_b32 s14, s13 +; SI-NEXT: s_mov_b32 s13, s9 +; SI-NEXT: s_mov_b32 s82, s71 +; SI-NEXT: s_mov_b32 s71, s69 +; SI-NEXT: s_mov_b32 s44, s45 +; SI-NEXT: s_mov_b32 s69, s55 +; SI-NEXT: s_mov_b32 s8, s11 +; SI-NEXT: s_mov_b32 s55, s51 +; SI-NEXT: s_mov_b32 s51, s91 +; SI-NEXT: s_mov_b32 s91, s94 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: s_mov_b32 s31, s90 +; SI-NEXT: s_mov_b32 s97, s90 +; SI-NEXT: s_mov_b32 s37, s87 +; SI-NEXT: s_mov_b32 s90, s79 +; 
SI-NEXT: s_mov_b32 s65, s18 +; SI-NEXT: s_mov_b32 s95, s22 +; SI-NEXT: s_mov_b32 s89, s10 ; SI-NEXT: s_cbranch_execnz .LBB93_3 ; SI-NEXT: .LBB93_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s78, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s92, 3 +; SI-NEXT: s_add_i32 s5, s75, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 vcc_lo, s77, 8 ; SI-NEXT: s_or_b32 s5, vcc_lo, s5 -; SI-NEXT: s_add_i32 vcc_lo, s37, 3 +; SI-NEXT: s_add_i32 vcc_lo, s48, 3 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 +; SI-NEXT: s_lshl_b32 vcc_hi, s88, 8 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s30, 3 +; SI-NEXT: s_add_i32 vcc_hi, s38, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s60, s39, 8 +; SI-NEXT: s_lshl_b32 s60, s35, 8 ; SI-NEXT: s_or_b32 s60, s60, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s35, 3 +; SI-NEXT: s_add_i32 vcc_hi, s54, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s61, s48, 8 +; SI-NEXT: s_lshl_b32 s61, s93, 8 ; SI-NEXT: s_or_b32 s61, s61, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s64, 3 +; SI-NEXT: s_add_i32 vcc_hi, s52, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s62, s52, 8 +; SI-NEXT: s_lshl_b32 s62, s49, 8 ; SI-NEXT: s_or_b32 s62, s62, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s67, 3 +; SI-NEXT: s_add_i32 vcc_hi, s64, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s63, s50, 8 -; SI-NEXT: s_or_b32 s10, s63, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s65, 3 +; SI-NEXT: s_lshl_b32 s63, s31, 8 +; SI-NEXT: s_or_b32 s63, s63, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s36, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s72, s54, 8 +; SI-NEXT: s_lshl_b32 s72, s66, 8 ; SI-NEXT: s_or_b32 s72, s72, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s71, 3 +; SI-NEXT: s_add_i32 vcc_hi, s70, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s73, s70, 
8 +; SI-NEXT: s_lshl_b32 s73, s80, 8 ; SI-NEXT: s_or_b32 s73, s73, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s83, 3 +; SI-NEXT: s_add_i32 vcc_hi, s68, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s74, s49, 8 +; SI-NEXT: s_lshl_b32 s74, s67, 8 ; SI-NEXT: s_or_b32 s74, s74, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s82, 3 +; SI-NEXT: s_add_i32 vcc_hi, s83, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s75, s80, 8 +; SI-NEXT: s_lshl_b32 s75, s39, 8 ; SI-NEXT: s_or_b32 s75, s75, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s87, 3 +; SI-NEXT: s_add_i32 vcc_hi, s85, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s76, s84, 8 +; SI-NEXT: s_lshl_b32 s76, s81, 8 ; SI-NEXT: s_or_b32 s76, s76, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s51, 3 -; SI-NEXT: s_add_i32 s93, s53, 3 +; SI-NEXT: s_add_i32 vcc_hi, s53, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s77, s86, 8 -; SI-NEXT: s_add_i32 s89, s34, 3 -; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s78, s94, 8 -; SI-NEXT: s_add_i32 s34, s66, 3 -; SI-NEXT: s_or_b32 s77, s77, vcc_hi -; SI-NEXT: s_and_b32 s89, s89, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8 -; SI-NEXT: s_or_b32 s22, s78, s93 -; SI-NEXT: s_and_b32 s93, s34, 0xff -; SI-NEXT: s_lshl_b32 s92, s16, 8 -; SI-NEXT: s_add_i32 s53, s68, 3 -; SI-NEXT: s_or_b32 s89, vcc_hi, s89 -; SI-NEXT: s_or_b32 s92, s92, s93 -; SI-NEXT: s_and_b32 s93, s53, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s15, 8 -; SI-NEXT: s_add_i32 s66, s69, 3 -; SI-NEXT: s_or_b32 s93, vcc_hi, s93 -; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff -; SI-NEXT: s_lshl_b32 s34, s45, 8 -; SI-NEXT: s_add_i32 s68, s6, 3 -; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi -; SI-NEXT: s_and_b32 s34, s68, 0xff -; SI-NEXT: s_lshl_b32 s39, s40, 8 -; SI-NEXT: s_add_i32 s69, s81, 3 -; SI-NEXT: s_or_b32 s34, s39, s34 -; SI-NEXT: s_and_b32 s39, s69, 0xff -; SI-NEXT: s_lshl_b32 s52, s21, 8 -; SI-NEXT: s_add_i32 s81, s7, 3 -; SI-NEXT: s_or_b32 s39, s52, s39 -; SI-NEXT: s_and_b32 s52, 
s81, 0xff -; SI-NEXT: s_lshl_b32 s53, s97, 8 -; SI-NEXT: s_add_i32 s85, s12, 3 -; SI-NEXT: s_or_b32 s52, s53, s52 -; SI-NEXT: s_and_b32 s53, s85, 0xff -; SI-NEXT: s_lshl_b32 s64, s11, 8 -; SI-NEXT: s_add_i32 s97, s56, 3 -; SI-NEXT: s_or_b32 s53, s64, s53 -; SI-NEXT: s_and_b32 s64, s97, 0xff +; SI-NEXT: s_or_b32 s20, s77, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s99, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s78, s50, 8 +; SI-NEXT: s_add_i32 s91, s91, 3 +; SI-NEXT: s_add_i32 s94, s51, 3 +; SI-NEXT: s_or_b32 s78, s78, vcc_hi +; SI-NEXT: s_and_b32 s91, s91, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s98, 8 +; SI-NEXT: s_and_b32 s94, s94, 0xff +; SI-NEXT: s_lshl_b32 s88, s6, 8 +; SI-NEXT: s_add_i32 s51, s55, 3 +; SI-NEXT: s_or_b32 s24, vcc_hi, s91 +; SI-NEXT: s_or_b32 s88, s88, s94 +; SI-NEXT: s_and_b32 s94, s51, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s26, 8 +; SI-NEXT: s_add_i32 s55, s69, 3 +; SI-NEXT: s_or_b32 s94, vcc_hi, s94 +; SI-NEXT: s_and_b32 vcc_hi, s55, 0xff +; SI-NEXT: s_lshl_b32 s35, s8, 8 +; SI-NEXT: s_add_i32 s69, s71, 3 +; SI-NEXT: s_or_b32 vcc_hi, s35, vcc_hi +; SI-NEXT: s_and_b32 s35, s69, 0xff +; SI-NEXT: s_lshl_b32 s48, s44, 8 +; SI-NEXT: s_add_i32 s71, s82, 3 +; SI-NEXT: s_or_b32 s35, s48, s35 +; SI-NEXT: s_and_b32 s48, s71, 0xff +; SI-NEXT: s_lshl_b32 s51, s96, 8 +; SI-NEXT: s_add_i32 s82, s14, 3 +; SI-NEXT: s_or_b32 s48, s51, s48 +; SI-NEXT: s_and_b32 s51, s82, 0xff +; SI-NEXT: s_lshl_b32 s54, s13, 8 +; SI-NEXT: s_add_i32 s84, s43, 3 +; SI-NEXT: s_or_b32 s51, s54, s51 +; SI-NEXT: s_and_b32 s54, s84, 0xff +; SI-NEXT: s_lshl_b32 s55, s40, 8 +; SI-NEXT: s_add_i32 s96, s56, 3 +; SI-NEXT: s_or_b32 s54, s55, s54 +; SI-NEXT: s_and_b32 s55, s96, 0xff ; SI-NEXT: s_lshl_b32 s66, s46, 8 -; SI-NEXT: s_add_i32 s21, s29, 3 -; SI-NEXT: s_or_b32 s64, s66, s64 -; SI-NEXT: s_and_b32 s21, s21, 0xff -; SI-NEXT: s_lshl_b32 s66, s59, 8 -; SI-NEXT: s_add_i32 s25, s8, 3 -; SI-NEXT: s_or_b32 s66, s66, s21 -; SI-NEXT: s_and_b32 s21, s25, 0xff -; SI-NEXT: 
s_lshl_b32 s6, s28, 8 -; SI-NEXT: s_add_i32 s29, s19, 3 -; SI-NEXT: s_or_b32 s67, s6, s21 -; SI-NEXT: s_and_b32 s6, s29, 0xff -; SI-NEXT: s_lshl_b32 s18, s26, 8 -; SI-NEXT: s_add_i32 s28, s17, 3 -; SI-NEXT: s_or_b32 s68, s18, s6 -; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: s_lshl_b32 s18, s23, 8 -; SI-NEXT: s_or_b32 s69, s18, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: s_add_i32 s16, s23, 3 +; SI-NEXT: s_or_b32 s55, s66, s55 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s66, s58, 8 +; SI-NEXT: s_add_i32 s23, s29, 3 +; SI-NEXT: s_or_b32 s66, s66, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xff +; SI-NEXT: s_lshl_b32 s23, s27, 8 +; SI-NEXT: s_add_i32 s27, s19, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 13 +; SI-NEXT: s_or_b32 s69, s23, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xff +; SI-NEXT: s_lshl_b32 s17, s25, 8 ; SI-NEXT: s_add_i32 s7, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 15 -; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 16 -; SI-NEXT: s_add_i32 s27, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 13 -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_lshl_b32 s23, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 14 -; SI-NEXT: s_mov_b32 s91, s24 -; SI-NEXT: s_or_b32 s70, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 16 +; SI-NEXT: s_or_b32 s71, s17, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s16, s21, 8 +; SI-NEXT: s_add_i32 s9, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 10 +; SI-NEXT: s_or_b32 s80, s16, s7 +; SI-NEXT: s_and_b32 s7, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 19 +; SI-NEXT: s_add_i32 s13, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: s_or_b32 s81, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 21 +; SI-NEXT: s_and_b32 s7, s13, 0xff +; SI-NEXT: s_add_i32 s14, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 18 +; SI-NEXT: s_or_b32 s82, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 23 +; SI-NEXT: s_and_b32 s7, 
s14, 0xff +; SI-NEXT: s_add_i32 s40, s6, 3 ; SI-NEXT: v_readlane_b32 s6, v62, 20 -; SI-NEXT: s_add_i32 s24, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 11 -; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 18 -; SI-NEXT: s_lshl_b32 s19, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 12 -; SI-NEXT: s_mov_b32 s90, s20 -; SI-NEXT: s_and_b32 s6, s11, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s20, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 9 -; SI-NEXT: s_or_b32 s71, s7, s6 +; SI-NEXT: s_or_b32 s83, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 25 +; SI-NEXT: s_and_b32 s7, s40, 0xff +; SI-NEXT: s_add_i32 s43, s6, 3 ; SI-NEXT: v_readlane_b32 s6, v62, 22 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s17, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 10 -; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 19 -; SI-NEXT: s_or_b32 s17, s17, s20 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s20, v62, 8 -; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 8 -; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 23 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_or_b32 s16, s20, s16 -; SI-NEXT: v_readlane_b32 s20, v62, 7 -; SI-NEXT: s_add_i32 s14, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 -; SI-NEXT: s_or_b32 s19, s19, s24 -; SI-NEXT: s_add_i32 s98, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v62, 6 -; SI-NEXT: s_and_b32 s6, s14, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s20, s98, 0xff -; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 26 -; SI-NEXT: s_and_b32 s27, s27, 0xff -; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v62, 5 -; SI-NEXT: s_add_i32 s41, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 -; SI-NEXT: s_or_b32 s23, s23, s27 -; SI-NEXT: s_add_i32 s86, s24, 3 -; SI-NEXT: 
v_readlane_b32 s27, v62, 4 -; SI-NEXT: s_and_b32 s6, s41, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s24, s86, 0xff -; SI-NEXT: s_lshl_b32 s27, s27, 8 -; SI-NEXT: s_or_b32 s85, s7, s6 +; SI-NEXT: s_or_b32 s84, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 ; SI-NEXT: v_readlane_b32 s6, v62, 28 -; SI-NEXT: s_or_b32 s24, s27, s24 -; SI-NEXT: v_readlane_b32 s27, v62, 3 +; SI-NEXT: s_and_b32 s7, s43, 0xff ; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 -; SI-NEXT: s_add_i32 s12, s73, 0x300 -; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s73, v62, 2 -; SI-NEXT: s_and_b32 s6, s46, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s27, s82, 0xff -; SI-NEXT: s_lshl_b32 s73, s73, 8 -; SI-NEXT: s_or_b32 s96, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 24 +; SI-NEXT: s_or_b32 s86, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 ; SI-NEXT: v_readlane_b32 s6, v62, 31 -; SI-NEXT: s_or_b32 s27, s73, s27 -; SI-NEXT: v_readlane_b32 s73, v62, 1 -; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 27 -; SI-NEXT: s_add_i32 s13, s74, 0x300 -; SI-NEXT: s_add_i32 s65, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v62, 0 -; SI-NEXT: s_and_b32 s6, s47, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s73, s65, 0xff -; SI-NEXT: s_lshl_b32 s74, s74, 8 -; SI-NEXT: s_or_b32 s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 34 -; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 63 +; SI-NEXT: s_and_b32 s7, s46, 0xff ; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 29 -; SI-NEXT: s_add_i32 s14, s75, 0x300 -; SI-NEXT: s_add_i32 s54, s74, 3 -; SI-NEXT: v_readlane_b32 s75, v61, 62 -; SI-NEXT: s_and_b32 s6, s56, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s74, s54, 0xff -; SI-NEXT: s_lshl_b32 s75, s75, 8 -; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 36 -; SI-NEXT: s_or_b32 s74, s75, s74 -; SI-NEXT: v_readlane_b32 s75, v61, 61 +; SI-NEXT: v_readlane_b32 s6, 
v62, 26 +; SI-NEXT: s_or_b32 s96, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 32 +; SI-NEXT: s_and_b32 s7, s56, 0xff +; SI-NEXT: s_add_i32 s57, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 27 +; SI-NEXT: s_or_b32 s98, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 33 +; SI-NEXT: s_and_b32 s7, s57, 0xff ; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 30 -; SI-NEXT: s_add_i32 s15, s76, 0x300 -; SI-NEXT: s_add_i32 s50, s75, 3 -; SI-NEXT: v_readlane_b32 s76, v61, 60 -; SI-NEXT: s_and_b32 s6, s58, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s75, s50, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 29 +; SI-NEXT: s_or_b32 s77, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 34 +; SI-NEXT: s_and_b32 s7, s58, 0xff +; SI-NEXT: s_add_i32 s59, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 30 +; SI-NEXT: s_or_b32 s79, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 15 +; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 12 +; SI-NEXT: s_lshl_b32 s29, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 14 +; SI-NEXT: s_add_i32 s28, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 9 +; SI-NEXT: s_lshl_b32 s25, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 11 +; SI-NEXT: s_add_i32 s26, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 7 +; SI-NEXT: s_lshl_b32 s21, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 8 +; SI-NEXT: s_add_i32 s22, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 5 +; SI-NEXT: s_lshl_b32 s19, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 6 +; SI-NEXT: s_add_i32 s18, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 4 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_or_b32 s6, s6, s18 +; SI-NEXT: v_readlane_b32 s18, v62, 3 +; SI-NEXT: s_or_b32 s19, s19, s22 +; SI-NEXT: s_add_i32 s99, s18, 3 +; SI-NEXT: v_readlane_b32 s22, v62, 2 +; SI-NEXT: s_and_b32 s18, s99, 0xff +; SI-NEXT: s_lshl_b32 
s22, s22, 8 +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_or_b32 s18, s22, s18 +; SI-NEXT: v_readlane_b32 s22, v62, 1 +; SI-NEXT: s_or_b32 s21, s21, s26 +; SI-NEXT: s_add_i32 s87, s22, 3 +; SI-NEXT: v_readlane_b32 s26, v62, 0 +; SI-NEXT: s_and_b32 s22, s87, 0xff +; SI-NEXT: s_lshl_b32 s26, s26, 8 +; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: s_or_b32 s22, s26, s22 +; SI-NEXT: v_readlane_b32 s26, v61, 63 +; SI-NEXT: s_or_b32 s25, s25, s28 +; SI-NEXT: s_add_i32 s70, s26, 3 +; SI-NEXT: v_readlane_b32 s28, v61, 62 +; SI-NEXT: s_and_b32 s26, s70, 0xff +; SI-NEXT: s_lshl_b32 s28, s28, 8 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_or_b32 s26, s28, s26 +; SI-NEXT: v_readlane_b32 s28, v61, 61 +; SI-NEXT: s_or_b32 s8, s29, s8 +; SI-NEXT: s_add_i32 s67, s28, 3 +; SI-NEXT: v_readlane_b32 s29, v61, 60 +; SI-NEXT: s_and_b32 s28, s67, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: v_readlane_b32 s29, v61, 59 +; SI-NEXT: s_add_i32 s14, s73, 0x300 +; SI-NEXT: s_add_i32 s64, s29, 3 +; SI-NEXT: v_readlane_b32 s73, v61, 58 +; SI-NEXT: s_and_b32 s29, s64, 0xff +; SI-NEXT: s_lshl_b32 s73, s73, 8 +; SI-NEXT: s_or_b32 s29, s73, s29 +; SI-NEXT: v_readlane_b32 s73, v61, 57 +; SI-NEXT: s_add_i32 s16, s75, 0x300 +; SI-NEXT: s_add_i32 s52, s73, 3 +; SI-NEXT: v_readlane_b32 s75, v61, 56 +; SI-NEXT: s_and_b32 s73, s52, 0xff +; SI-NEXT: s_lshl_b32 s75, s75, 8 +; SI-NEXT: s_or_b32 s73, s75, s73 +; SI-NEXT: v_readlane_b32 s75, v61, 55 +; SI-NEXT: s_add_i32 s17, s76, 0x300 +; SI-NEXT: s_add_i32 s49, s75, 3 +; SI-NEXT: v_readlane_b32 s76, v61, 54 +; SI-NEXT: s_and_b32 s75, s49, 0xff ; SI-NEXT: s_lshl_b32 s76, s76, 8 -; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 37 ; SI-NEXT: s_or_b32 s75, s76, s75 -; SI-NEXT: v_readlane_b32 s76, v61, 59 -; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 33 -; SI-NEXT: s_add_i32 s18, s77, 0x300 -; SI-NEXT: s_add_i32 s48, s76, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 58 -; 
SI-NEXT: s_and_b32 s6, s59, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s76, s48, 0xff -; SI-NEXT: s_lshl_b32 s77, s77, 8 -; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 35 -; SI-NEXT: s_or_b32 s76, s77, s76 -; SI-NEXT: v_readlane_b32 s77, v61, 57 -; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 32 -; SI-NEXT: s_add_i32 s11, s72, 0x300 -; SI-NEXT: s_add_i32 s72, s79, 0x300 -; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 56 -; SI-NEXT: s_and_b32 s6, s57, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s77, s37, 0xff -; SI-NEXT: s_lshl_b32 s79, s79, 8 -; SI-NEXT: s_or_b32 s88, s7, s6 -; SI-NEXT: s_or_b32 s77, s79, s77 -; SI-NEXT: v_readlane_b32 s79, v61, 55 -; SI-NEXT: s_add_i32 s21, s89, 0x300 -; SI-NEXT: s_add_i32 s89, s88, 0x300 -; SI-NEXT: s_add_i32 s35, s79, 3 -; SI-NEXT: v_readlane_b32 s88, v61, 54 -; SI-NEXT: s_and_b32 s79, s35, 0xff -; SI-NEXT: s_lshl_b32 s88, s88, 8 -; SI-NEXT: s_or_b32 s79, s88, s79 -; SI-NEXT: v_readlane_b32 s88, v61, 53 -; SI-NEXT: s_add_i32 s25, s92, 0x300 -; SI-NEXT: s_add_i32 s30, s88, 3 -; SI-NEXT: v_readlane_b32 s92, v61, 52 -; SI-NEXT: s_and_b32 s88, s30, 0xff -; SI-NEXT: s_lshl_b32 s92, s92, 8 -; SI-NEXT: s_or_b32 s88, s92, s88 -; SI-NEXT: v_readlane_b32 s92, v61, 51 -; SI-NEXT: s_add_i32 s94, s92, 3 -; SI-NEXT: s_and_b32 s92, s94, 0xff -; SI-NEXT: s_lshl_b32 s91, s91, 8 -; SI-NEXT: s_add_i32 s90, s90, 3 -; SI-NEXT: s_or_b32 s91, s91, s92 -; SI-NEXT: s_and_b32 s90, s90, 0xff -; SI-NEXT: s_lshl_b32 s92, s31, 8 -; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 50 +; SI-NEXT: v_readlane_b32 s76, v61, 53 +; SI-NEXT: s_add_i32 s23, s78, 0x300 +; SI-NEXT: s_add_i32 s38, s76, 3 +; SI-NEXT: v_readlane_b32 s78, v61, 52 +; SI-NEXT: s_and_b32 s76, s38, 0xff +; SI-NEXT: s_lshl_b32 s78, s78, 8 +; SI-NEXT: s_or_b32 s76, s78, s76 +; SI-NEXT: v_readlane_b32 s78, v61, 51 +; SI-NEXT: s_add_i32 s36, s78, 3 +; SI-NEXT: s_add_i32 s27, s88, 
0x300 +; SI-NEXT: s_add_i32 s88, s79, 0x300 +; SI-NEXT: s_and_b32 s78, s36, 0xff +; SI-NEXT: s_lshl_b32 s79, s89, 8 +; SI-NEXT: s_or_b32 s78, s79, s78 +; SI-NEXT: v_readlane_b32 s79, v61, 50 +; SI-NEXT: s_add_i32 s31, s79, 3 +; SI-NEXT: s_and_b32 s79, s31, 0xff +; SI-NEXT: s_lshl_b32 s89, s95, 8 +; SI-NEXT: s_add_i32 s92, s92, 3 +; SI-NEXT: s_or_b32 s79, s89, s79 +; SI-NEXT: v_readlane_b32 s89, v61, 49 +; SI-NEXT: s_and_b32 s92, s92, 0xff +; SI-NEXT: s_lshl_b32 s90, s90, 8 +; SI-NEXT: s_add_i32 s95, s89, 3 +; SI-NEXT: s_or_b32 s90, s90, s92 +; SI-NEXT: v_readlane_b32 s92, v61, 48 +; SI-NEXT: s_and_b32 s89, s95, 0xff +; SI-NEXT: s_lshl_b32 s93, s65, 8 ; SI-NEXT: s_add_i32 s92, s92, 3 -; SI-NEXT: s_add_i32 s26, s93, 0x300 +; SI-NEXT: s_or_b32 s89, s93, s89 ; SI-NEXT: s_and_b32 s92, s92, 0xff -; SI-NEXT: s_lshl_b32 s93, s95, 8 +; SI-NEXT: s_lshl_b32 s93, s37, 8 ; SI-NEXT: s_or_b32 s92, s93, s92 -; SI-NEXT: v_readlane_b32 s93, v61, 49 +; SI-NEXT: v_readlane_b32 s93, v61, 47 ; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_add_i32 s40, s94, 0x300 ; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s94, s55, 8 +; SI-NEXT: s_lshl_b32 s94, s97, 8 ; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: v_readlane_b32 s94, v61, 48 +; SI-NEXT: v_readlane_b32 s94, v61, 46 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_and_b32 s94, s94, 0xff -; SI-NEXT: s_lshl_b32 s95, s99, 8 +; SI-NEXT: s_lshl_b32 s95, s30, 8 ; SI-NEXT: s_or_b32 s94, s95, s94 ; SI-NEXT: v_readlane_b32 s95, v61, 1 +; SI-NEXT: s_and_b32 s7, s59, 0xff ; SI-NEXT: s_add_i32 s95, s95, 3 ; SI-NEXT: v_readlane_b32 s30, v61, 0 -; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 +; SI-NEXT: s_or_b32 s91, s9, s7 +; SI-NEXT: s_add_i32 s7, vcc_lo, 0x300 ; SI-NEXT: s_and_b32 s95, s95, 0xff ; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 47 +; SI-NEXT: v_readlane_b32 s30, v61, 3 ; SI-NEXT: s_or_b32 s95, vcc_lo, s95 ; SI-NEXT: s_add_i32 vcc_lo, s30, 3 ; SI-NEXT: v_readlane_b32 s30, v61, 2 -; SI-NEXT: s_add_i32 s28, 
vcc_hi, 0x300 +; SI-NEXT: s_add_i32 s41, vcc_hi, 0x300 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 46 +; SI-NEXT: v_readlane_b32 s30, v61, 45 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo ; SI-NEXT: s_add_i32 vcc_hi, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 45 +; SI-NEXT: v_readlane_b32 s30, v61, 44 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s30, s30, 8 ; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi -; SI-NEXT: v_readlane_b32 s30, v61, 44 +; SI-NEXT: v_readlane_b32 s30, v61, 43 ; SI-NEXT: s_add_i32 s30, s30, 3 -; SI-NEXT: v_readlane_b32 s31, v61, 43 +; SI-NEXT: v_readlane_b32 s31, v61, 42 ; SI-NEXT: s_and_b32 s30, s30, 0xff ; SI-NEXT: s_lshl_b32 s31, s31, 8 ; SI-NEXT: s_or_b32 s30, s31, s30 -; SI-NEXT: v_readlane_b32 s31, v61, 42 -; SI-NEXT: s_add_i32 s29, s34, 0x300 +; SI-NEXT: v_readlane_b32 s31, v61, 41 ; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: v_readlane_b32 s34, v61, 41 +; SI-NEXT: v_readlane_b32 s34, v61, 40 ; SI-NEXT: s_and_b32 s31, s31, 0xff ; SI-NEXT: s_lshl_b32 s34, s34, 8 ; SI-NEXT: s_or_b32 s31, s34, s31 @@ -176431,22 +177050,23 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, s31 ; SI-NEXT: s_addk_i32 s30, 0x300 ; SI-NEXT: s_addk_i32 vcc_hi, 0x300 -; SI-NEXT: v_readlane_b32 s34, v61, 40 +; SI-NEXT: v_readlane_b32 s34, v61, 39 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s30 +; SI-NEXT: s_add_i32 s42, s35, 0x300 ; SI-NEXT: s_add_i32 s34, s34, 3 -; SI-NEXT: v_readlane_b32 s35, v61, 39 -; SI-NEXT: s_and_b32 s34, s34, 0xff +; SI-NEXT: v_readlane_b32 s35, v61, 38 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi +; SI-NEXT: s_and_b32 s34, s34, 0xff ; SI-NEXT: s_lshl_b32 s35, s35, 8 ; SI-NEXT: 
s_addk_i32 vcc_lo, 0x300 ; SI-NEXT: s_or_b32 s34, s35, s34 -; SI-NEXT: v_readlane_b32 s35, v61, 38 +; SI-NEXT: v_readlane_b32 s35, v61, 37 ; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: v_readlane_b32 s36, v61, 37 +; SI-NEXT: v_readlane_b32 s36, v61, 36 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo @@ -176455,43 +177075,47 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_or_b32 s35, s36, s35 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_add_i32 s7, s60, 0x300 -; SI-NEXT: s_add_i32 s8, s61, 0x300 -; SI-NEXT: s_add_i32 s9, s62, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s22, 0x300 -; SI-NEXT: s_add_i32 s40, s39, 0x300 -; SI-NEXT: s_add_i32 s41, s52, 0x300 -; SI-NEXT: s_add_i32 s42, s53, 0x300 -; SI-NEXT: s_add_i32 s43, s64, 0x300 -; SI-NEXT: s_add_i32 s44, s66, 0x300 -; SI-NEXT: s_add_i32 s45, s67, 0x300 -; SI-NEXT: s_add_i32 s46, s68, 0x300 -; SI-NEXT: s_add_i32 s47, s69, 0x300 -; SI-NEXT: s_add_i32 s56, s70, 0x300 -; SI-NEXT: s_add_i32 s57, s71, 0x300 -; SI-NEXT: s_add_i32 s58, s81, 0x300 -; SI-NEXT: s_add_i32 s59, s83, 0x300 -; SI-NEXT: s_add_i32 s60, s85, 0x300 -; SI-NEXT: s_add_i32 s61, s96, 0x300 -; SI-NEXT: s_add_i32 s62, s97, 0x300 -; SI-NEXT: s_addk_i32 s63, 0x300 -; SI-NEXT: s_addk_i32 s78, 0x300 -; SI-NEXT: s_addk_i32 s23, 0x300 -; SI-NEXT: s_addk_i32 s19, 0x300 -; SI-NEXT: s_addk_i32 s17, 0x300 -; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_add_i32 s9, s60, 0x300 +; SI-NEXT: s_add_i32 s10, s61, 0x300 +; SI-NEXT: s_add_i32 s11, s62, 0x300 +; SI-NEXT: s_add_i32 s12, s63, 0x300 +; SI-NEXT: s_add_i32 s13, s72, 0x300 +; SI-NEXT: s_add_i32 s15, s74, 0x300 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_addk_i32 s24, 0x300 -; SI-NEXT: s_addk_i32 s27, 0x300 +; SI-NEXT: s_add_i32 s43, s48, 0x300 +; SI-NEXT: s_add_i32 s44, s51, 0x300 +; SI-NEXT: s_add_i32 s45, 
s54, 0x300 +; SI-NEXT: s_add_i32 s46, s55, 0x300 +; SI-NEXT: s_add_i32 s47, s66, 0x300 +; SI-NEXT: s_add_i32 s56, s69, 0x300 +; SI-NEXT: s_add_i32 s57, s71, 0x300 +; SI-NEXT: s_add_i32 s58, s80, 0x300 +; SI-NEXT: s_add_i32 s59, s81, 0x300 +; SI-NEXT: s_add_i32 s60, s82, 0x300 +; SI-NEXT: s_add_i32 s61, s83, 0x300 +; SI-NEXT: s_add_i32 s62, s84, 0x300 +; SI-NEXT: s_add_i32 s63, s86, 0x300 +; SI-NEXT: s_add_i32 s72, s96, 0x300 +; SI-NEXT: s_add_i32 s74, s98, 0x300 +; SI-NEXT: s_addk_i32 s77, 0x300 +; SI-NEXT: s_addk_i32 s91, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s25, 0x300 +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_addk_i32 s19, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_addk_i32 s22, 0x300 +; SI-NEXT: s_addk_i32 s26, 0x300 +; SI-NEXT: s_addk_i32 s28, 0x300 +; SI-NEXT: s_addk_i32 s29, 0x300 ; SI-NEXT: s_addk_i32 s73, 0x300 -; SI-NEXT: s_addk_i32 s74, 0x300 ; SI-NEXT: s_addk_i32 s75, 0x300 ; SI-NEXT: s_addk_i32 s76, 0x300 -; SI-NEXT: s_addk_i32 s77, 0x300 +; SI-NEXT: s_addk_i32 s78, 0x300 ; SI-NEXT: s_addk_i32 s79, 0x300 -; SI-NEXT: s_addk_i32 s88, 0x300 -; SI-NEXT: s_addk_i32 s91, 0x300 +; SI-NEXT: s_addk_i32 s89, 0x300 ; SI-NEXT: s_addk_i32 s90, 0x300 ; SI-NEXT: s_addk_i32 s92, 0x300 ; SI-NEXT: s_addk_i32 s93, 0x300 @@ -176505,60 +177129,60 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v7, s95 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s94 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s92 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s27 -; 
SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 +; SI-NEXT: 
v_cvt_f32_f16_e32 v21, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: .LBB93_3: ; %end @@ -176635,7 +177259,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 ; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -176650,7 +177274,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 ; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -176851,79 +177475,81 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: .LBB93_4: ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: s_mov_b32 s19, s38 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: s_mov_b32 s21, s48 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_mov_b32 s23, s48 -; SI-NEXT: s_mov_b32 s26, s37 -; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s25, s77 ; SI-NEXT: s_mov_b32 s29, s76 -; SI-NEXT: s_mov_b32 s59, s58 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: s_mov_b32 s27, s23 +; SI-NEXT: s_mov_b32 s23, s16 +; SI-NEXT: s_mov_b32 s58, s57 +; SI-NEXT: s_mov_b32 s56, s46 +; SI-NEXT: s_mov_b32 s46, s43 +; SI-NEXT: s_mov_b32 s43, s40 +; SI-NEXT: s_mov_b32 s40, s14 +; SI-NEXT: s_mov_b32 s14, s13 +; SI-NEXT: s_mov_b32 s13, s9 +; SI-NEXT: s_mov_b32 s82, s71 +; SI-NEXT: 
s_mov_b32 s71, s69 +; SI-NEXT: s_mov_b32 s69, s55 +; SI-NEXT: s_mov_b32 s55, s51 +; SI-NEXT: s_mov_b32 s51, s91 +; SI-NEXT: s_mov_b32 s91, s94 +; SI-NEXT: s_mov_b32 s44, s45 +; SI-NEXT: s_mov_b32 s8, s11 +; SI-NEXT: s_mov_b32 s30, s97 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: v_readlane_b32 s75, v61, 7 -; SI-NEXT: v_readlane_b32 s76, v61, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s39, v61, 9 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s48, v61, 11 -; SI-NEXT: v_readlane_b32 s52, v61, 13 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s50, v61, 15 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s54, v61, 17 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s70, v61, 19 -; SI-NEXT: v_readlane_b32 s49, v61, 21 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s80, v61, 23 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s84, v61, 25 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s86, v61, 27 -; SI-NEXT: v_readlane_b32 s96, v61, 29 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: v_readlane_b32 s94, v61, 31 -; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s66, v61, 33 -; SI-NEXT: v_readlane_b32 s68, v61, 34 -; SI-NEXT: v_readlane_b32 s69, v61, 35 -; SI-NEXT: v_readlane_b32 s8, v61, 36 +; SI-NEXT: v_readlane_b32 s76, v61, 4 +; SI-NEXT: v_readlane_b32 s77, v61, 6 +; SI-NEXT: v_readlane_b32 s78, v61, 5 +; SI-NEXT: v_readlane_b32 s88, v61, 8 +; SI-NEXT: v_readlane_b32 s35, v61, 10 +; SI-NEXT: v_readlane_b32 s48, v61, 9 +; SI-NEXT: 
v_readlane_b32 s38, v61, 11 +; SI-NEXT: v_readlane_b32 s93, v61, 12 +; SI-NEXT: v_readlane_b32 s49, v61, 14 +; SI-NEXT: v_readlane_b32 s54, v61, 13 +; SI-NEXT: v_readlane_b32 s31, v61, 16 +; SI-NEXT: v_readlane_b32 s52, v61, 15 +; SI-NEXT: v_readlane_b32 s66, v61, 18 +; SI-NEXT: v_readlane_b32 s64, v61, 17 +; SI-NEXT: v_readlane_b32 s36, v61, 19 +; SI-NEXT: v_readlane_b32 s80, v61, 20 +; SI-NEXT: v_readlane_b32 s67, v61, 22 +; SI-NEXT: v_readlane_b32 s70, v61, 21 +; SI-NEXT: v_readlane_b32 s39, v61, 24 +; SI-NEXT: v_readlane_b32 s68, v61, 23 +; SI-NEXT: v_readlane_b32 s81, v61, 26 +; SI-NEXT: v_readlane_b32 s83, v61, 25 +; SI-NEXT: v_readlane_b32 s85, v61, 27 +; SI-NEXT: v_readlane_b32 s86, v61, 28 +; SI-NEXT: v_readlane_b32 s50, v61, 30 +; SI-NEXT: v_readlane_b32 s53, v61, 29 +; SI-NEXT: v_readlane_b32 s98, v61, 32 +; SI-NEXT: s_mov_b32 s97, s90 +; SI-NEXT: v_readlane_b32 s99, v61, 31 +; SI-NEXT: s_mov_b32 s37, s87 +; SI-NEXT: v_readlane_b32 s6, v61, 33 +; SI-NEXT: s_mov_b32 s90, s79 +; SI-NEXT: s_mov_b32 s65, s18 +; SI-NEXT: s_mov_b32 s95, s22 +; SI-NEXT: s_mov_b32 s89, s10 +; SI-NEXT: v_readlane_b32 s26, v61, 34 +; SI-NEXT: v_readlane_b32 s96, v61, 35 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -179145,29 +179771,29 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 
offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:132 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:108 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v144, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:36 @@ -179199,16 +179825,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v18 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v162, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v148, 8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v176, 8, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v178, 8, v31 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 @@ -179264,7 +179890,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s5 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 @@ -179281,6 +179907,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff @@ -179289,115 +179918,139 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v2, 0xff, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v133 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v66 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v7, 0xff, v39 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 
v0, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v52 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v102 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v162 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v148 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45 @@ -179405,40 +180058,40 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74 @@ -179579,112 +180232,112 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v177 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v161 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v75, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v147 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 -; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v150 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v3, 3, v149 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v145 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v178, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v162, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v148, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 
0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v133, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v131, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 @@ -179698,7 +180351,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v129, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 @@ -179707,7 +180360,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v135, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6 @@ -179805,32 +180458,32 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 
0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v145, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v118, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v128, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v134, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v161, 16, v32 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 @@ -179969,31 +180622,31 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: 
scratch_load_u16 v43, off, s32 offset:268 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:132 ; 
GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 @@ -180025,16 +180678,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, 
v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 @@ -180090,7 +180743,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 @@ -180107,6 +180760,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff @@ -180115,115 +180771,139 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v135 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v133 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v66 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v81 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 
v2, v2, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v102 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v112 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v145 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 @@ -180231,40 +180911,40 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 @@ -180272,7 +180952,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 @@ -180400,117 +181080,117 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, 
v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v179 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v177 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v161 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v147 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v132 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v150 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v149 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v145 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v178, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v130 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v128 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v148, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v133, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 @@ -180524,7 +181204,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v129, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 ; 
GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 @@ -180533,7 +181213,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v135, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 @@ -180631,33 +181311,33 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v145, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v118, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 ; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v134, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v161, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 @@ -185290,482 +185970,482 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 ; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 
24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] ; GFX11-FAKE16-NEXT: .LBB94_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] 
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; 
GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, 
v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v8 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v25 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-FAKE16-NEXT: .LBB94_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v61 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v73 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v43 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v55, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v65, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v51, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v148 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v145 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 
v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v101 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v87 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v10, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v83 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 @@ -185787,30 +186467,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v45 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v41 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v181 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v167 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 @@ -185832,29 +186512,29 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v161 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v21, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v147 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v144 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v131 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v119 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 @@ -185877,31 +186557,31 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v113 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 
8, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v102 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v100 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v86 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v70 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 @@ -190499,257 +191179,258 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, 
v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v75, s30, 0 -; GFX11-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-NEXT: v_writelane_b32 v74, s30, 0 +; GFX11-NEXT: v_writelane_b32 v75, s96, 0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 ; GFX11-NEXT: v_readfirstlane_b32 s40, v1 ; GFX11-NEXT: v_readfirstlane_b32 s41, v2 -; GFX11-NEXT: v_writelane_b32 v75, s31, 1 -; GFX11-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-NEXT: v_writelane_b32 v74, s31, 1 +; GFX11-NEXT: v_writelane_b32 v75, s97, 1 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 ; GFX11-NEXT: v_readfirstlane_b32 s12, v5 -; GFX11-NEXT: v_writelane_b32 v75, s34, 2 -; GFX11-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-NEXT: v_writelane_b32 v74, s34, 2 +; GFX11-NEXT: v_writelane_b32 v75, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s13, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v75, s35, 3 -; GFX11-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-NEXT: v_writelane_b32 v74, s35, 3 +; GFX11-NEXT: v_writelane_b32 v75, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v75, s36, 4 -; GFX11-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-NEXT: v_writelane_b32 v74, s36, 4 +; GFX11-NEXT: v_writelane_b32 v75, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s4, v13 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 -; GFX11-NEXT: v_writelane_b32 v75, s37, 5 -; GFX11-NEXT: v_writelane_b32 v76, s101, 5 -; GFX11-NEXT: s_mov_b32 s99, 0 +; GFX11-NEXT: v_writelane_b32 v74, s37, 5 +; GFX11-NEXT: v_writelane_b32 v75, s101, 5 +; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: 
scratch_store_b32 off, v40, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 -; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:20 +; GFX11-NEXT: 
scratch_store_b32 off, v61, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 +; GFX11-NEXT: v_writelane_b32 v74, s38, 6 +; GFX11-NEXT: v_writelane_b32 v75, s102, 6 ; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s39, 7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 -; GFX11-NEXT: v_writelane_b32 v75, s48, 8 -; GFX11-NEXT: v_writelane_b32 v76, s104, 8 -; GFX11-NEXT: v_writelane_b32 v75, s49, 9 -; GFX11-NEXT: v_writelane_b32 v75, s50, 10 -; GFX11-NEXT: v_writelane_b32 v75, s51, 11 -; GFX11-NEXT: v_writelane_b32 v75, s52, 12 -; GFX11-NEXT: v_writelane_b32 v75, s53, 13 -; GFX11-NEXT: v_writelane_b32 v75, s54, 14 -; GFX11-NEXT: v_writelane_b32 v75, s55, 15 -; GFX11-NEXT: v_writelane_b32 v75, s64, 16 -; GFX11-NEXT: v_writelane_b32 v75, s65, 17 -; GFX11-NEXT: v_writelane_b32 v75, s66, 18 -; GFX11-NEXT: v_writelane_b32 v75, s67, 19 -; GFX11-NEXT: v_writelane_b32 v75, s68, 20 -; GFX11-NEXT: v_writelane_b32 v75, s69, 21 -; GFX11-NEXT: v_writelane_b32 v75, s70, 22 -; GFX11-NEXT: v_writelane_b32 v75, s71, 23 -; GFX11-NEXT: v_writelane_b32 v75, s80, 24 -; GFX11-NEXT: v_writelane_b32 v75, s81, 25 -; GFX11-NEXT: v_writelane_b32 v75, s82, 26 -; GFX11-NEXT: v_writelane_b32 v75, s83, 27 -; GFX11-NEXT: v_writelane_b32 v75, s84, 28 -; GFX11-NEXT: v_writelane_b32 v75, s85, 29 -; GFX11-NEXT: v_writelane_b32 v75, s86, 30 -; GFX11-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v74, s39, 7 +; GFX11-NEXT: v_writelane_b32 v75, s103, 7 +; GFX11-NEXT: v_writelane_b32 v74, s48, 8 +; GFX11-NEXT: v_writelane_b32 v75, s104, 8 +; GFX11-NEXT: v_writelane_b32 v74, s49, 9 +; GFX11-NEXT: v_writelane_b32 v74, s50, 10 +; GFX11-NEXT: v_writelane_b32 v74, s51, 11 +; GFX11-NEXT: v_writelane_b32 v74, 
s52, 12 +; GFX11-NEXT: v_writelane_b32 v74, s53, 13 +; GFX11-NEXT: v_writelane_b32 v74, s54, 14 +; GFX11-NEXT: v_writelane_b32 v74, s55, 15 +; GFX11-NEXT: v_writelane_b32 v74, s64, 16 +; GFX11-NEXT: v_writelane_b32 v74, s65, 17 +; GFX11-NEXT: v_writelane_b32 v74, s66, 18 +; GFX11-NEXT: v_writelane_b32 v74, s67, 19 +; GFX11-NEXT: v_writelane_b32 v74, s68, 20 +; GFX11-NEXT: v_writelane_b32 v74, s69, 21 +; GFX11-NEXT: v_writelane_b32 v74, s70, 22 +; GFX11-NEXT: v_writelane_b32 v74, s71, 23 +; GFX11-NEXT: v_writelane_b32 v74, s80, 24 +; GFX11-NEXT: v_writelane_b32 v74, s81, 25 +; GFX11-NEXT: v_writelane_b32 v74, s82, 26 +; GFX11-NEXT: v_writelane_b32 v74, s83, 27 +; GFX11-NEXT: v_writelane_b32 v74, s84, 28 +; GFX11-NEXT: v_writelane_b32 v74, s85, 29 +; GFX11-NEXT: v_writelane_b32 v74, s86, 30 +; GFX11-NEXT: v_writelane_b32 v74, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s27, 8 ; GFX11-NEXT: s_lshr_b32 s43, s27, 24 ; GFX11-NEXT: s_lshr_b32 s34, s5, 24 ; GFX11-NEXT: s_lshr_b32 s35, s5, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-NEXT: v_writelane_b32 v76, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s26, 16 ; GFX11-NEXT: s_lshr_b32 s37, s5, 8 ; GFX11-NEXT: s_lshr_b32 s36, s4, 16 ; GFX11-NEXT: s_lshr_b32 s38, s4, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-NEXT: v_writelane_b32 v76, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s26, 8 ; GFX11-NEXT: s_lshr_b32 s39, s7, 24 ; GFX11-NEXT: s_lshr_b32 s48, s7, 16 ; GFX11-NEXT: s_lshr_b32 s50, s7, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-NEXT: v_writelane_b32 v76, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s25, 24 ; GFX11-NEXT: s_lshr_b32 s49, s6, 16 ; GFX11-NEXT: s_lshr_b32 s51, s6, 8 ; GFX11-NEXT: s_lshr_b32 s52, s9, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-NEXT: 
v_writelane_b32 v76, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s25, 16 ; GFX11-NEXT: s_lshr_b32 s53, s9, 16 ; GFX11-NEXT: s_lshr_b32 s55, s9, 8 ; GFX11-NEXT: s_lshr_b32 s54, s8, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-NEXT: v_writelane_b32 v76, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s25, 8 ; GFX11-NEXT: s_lshr_b32 s64, s8, 8 ; GFX11-NEXT: s_lshr_b32 s65, s11, 24 ; GFX11-NEXT: s_lshr_b32 s66, s11, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-NEXT: v_writelane_b32 v76, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s24, 16 ; GFX11-NEXT: s_lshr_b32 s68, s11, 8 ; GFX11-NEXT: s_lshr_b32 s67, s10, 16 ; GFX11-NEXT: s_lshr_b32 s69, s10, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-NEXT: v_writelane_b32 v76, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s24, 8 ; GFX11-NEXT: s_lshr_b32 s70, s13, 24 ; GFX11-NEXT: s_lshr_b32 s71, s13, 16 ; GFX11-NEXT: s_lshr_b32 s81, s13, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-NEXT: v_writelane_b32 v76, s42, 0 ; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: s_lshr_b32 s80, s12, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-NEXT: v_writelane_b32 v77, s42, 31 ; GFX11-NEXT: s_lshr_b32 s42, s23, 16 ; GFX11-NEXT: s_lshr_b32 s82, s12, 8 ; GFX11-NEXT: s_lshr_b32 s83, s15, 24 ; GFX11-NEXT: s_lshr_b32 s84, s15, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-NEXT: v_writelane_b32 v77, s42, 30 ; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_lshr_b32 s86, s15, 8 ; GFX11-NEXT: s_lshr_b32 s85, s14, 16 ; GFX11-NEXT: s_lshr_b32 s87, s14, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-NEXT: v_writelane_b32 v77, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s22, 16 ; GFX11-NEXT: s_lshr_b32 s96, s41, 24 ; GFX11-NEXT: s_lshr_b32 s97, s41, 16 -; GFX11-NEXT: s_lshr_b32 s100, s41, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-NEXT: s_lshr_b32 s99, s41, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s22, 8 ; GFX11-NEXT: s_lshr_b32 s98, s40, 16 -; GFX11-NEXT: s_lshr_b32 s101, s40, 
8 -; GFX11-NEXT: s_lshr_b32 s102, s29, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-NEXT: s_lshr_b32 s100, s40, 8 +; GFX11-NEXT: s_lshr_b32 s101, s29, 24 +; GFX11-NEXT: v_writelane_b32 v77, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b32 s103, s29, 16 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s29, 8 -; GFX11-NEXT: s_lshr_b32 s104, s28, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-NEXT: s_lshr_b32 s104, s29, 8 +; GFX11-NEXT: s_lshr_b32 s103, s28, 16 +; GFX11-NEXT: v_writelane_b32 v77, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 ; GFX11-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 ; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-NEXT: v_writelane_b32 v77, s42, 25 ; GFX11-NEXT: s_lshr_b32 s42, s21, 8 ; GFX11-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-NEXT: v_writelane_b32 v77, s42, 24 ; GFX11-NEXT: s_lshr_b32 s42, s20, 16 ; GFX11-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-NEXT: v_writelane_b32 v77, s42, 23 ; GFX11-NEXT: s_lshr_b32 s42, s20, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-NEXT: v_writelane_b32 v77, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s19, 24 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-NEXT: v_writelane_b32 v77, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 
-; GFX11-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-NEXT: v_writelane_b32 v77, s42, 20 ; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-NEXT: v_writelane_b32 v77, s42, 19 ; GFX11-NEXT: s_lshr_b32 s42, s18, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-NEXT: v_writelane_b32 v77, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-NEXT: v_writelane_b32 v77, s42, 17 ; GFX11-NEXT: s_lshr_b32 s42, s17, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-NEXT: v_writelane_b32 v77, s42, 16 ; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-NEXT: v_writelane_b32 v77, s42, 15 ; GFX11-NEXT: s_lshr_b32 s42, s17, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-NEXT: v_writelane_b32 v77, s42, 14 ; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-NEXT: v_writelane_b32 v77, s42, 13 ; GFX11-NEXT: s_lshr_b32 s42, s16, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-NEXT: v_writelane_b32 v77, s42, 12 ; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-NEXT: v_writelane_b32 v77, s42, 11 ; GFX11-NEXT: s_lshr_b32 s42, s3, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-NEXT: v_writelane_b32 v77, s42, 10 ; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-NEXT: v_writelane_b32 v77, s42, 9 ; GFX11-NEXT: s_lshr_b32 s42, s2, 16 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s2, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-NEXT: v_writelane_b32 v77, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s1, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-NEXT: v_writelane_b32 v77, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-NEXT: v_writelane_b32 v77, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-NEXT: v_writelane_b32 v77, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s0, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-NEXT: v_writelane_b32 v77, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-NEXT: v_writelane_b32 v77, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s28, 8 -; GFX11-NEXT: v_writelane_b32 v78, s74, 0 -; GFX11-NEXT: v_writelane_b32 v78, s75, 1 +; GFX11-NEXT: v_writelane_b32 v77, s74, 0 +; GFX11-NEXT: v_writelane_b32 v77, s75, 1 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s99 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi ; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 ; GFX11-NEXT: .LBB95_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v39, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v38, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, s22 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s5 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 
0x200, s4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v51, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v50, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v53, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v37, 0x200, s19 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v33, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v29, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v28, 0x200, s22 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v32, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s41 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s40 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v36, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, s41 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s40 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] @@ -190760,115 +191441,112 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v53, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v37, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v55, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v54, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v39, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v38, 0x200, s16 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v21, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v20, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v36, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[38:39] -; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[28:29] -; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[32:33] -; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[50:51] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[24:25] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[28:29] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] ; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[20:21] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[24:25] -; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[36:37] -; GFX11-NEXT: v_lshrrev_b64 v[82:83], 24, v[52:53] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[20:21] +; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] ; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] ; GFX11-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] ; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[15:16] -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 24, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 8, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 8, v24 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v166, 24, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 16, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 16, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 16, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 24, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 24, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 24, v29 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v164, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 8, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 24, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v54 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v54 ; GFX11-NEXT: v_lshrrev_b32_e32 v19, 24, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v5 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v97, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 24, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12 
+; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v13 ; GFX11-NEXT: s_branch .LBB95_5 ; GFX11-NEXT: .LBB95_3: ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s99, -1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr44 @@ -190876,13 +191554,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr56 ; GFX11-NEXT: ; implicit-def: $sgpr58 ; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr104 ; GFX11-NEXT: ; implicit-def: $sgpr102 ; GFX11-NEXT: ; implicit-def: $sgpr101 -; GFX11-NEXT: ; implicit-def: $sgpr98 ; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr99 ; GFX11-NEXT: ; implicit-def: $sgpr97 ; GFX11-NEXT: ; implicit-def: $sgpr96 ; GFX11-NEXT: ; implicit-def: $sgpr87 @@ -190926,8 +191604,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 +; GFX11-NEXT: v_writelane_b32 
v77, s42, 0 +; GFX11-NEXT: v_writelane_b32 v77, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -191006,419 +191684,422 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB95_2 ; GFX11-NEXT: .LBB95_4: -; GFX11-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 -; GFX11-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-NEXT: v_dual_mov_b32 v54, s0 :: v_dual_mov_b32 v55, s1 +; GFX11-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 +; GFX11-NEXT: v_dual_mov_b32 v15, s40 :: v_dual_mov_b32 v16, s41 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 3 ; GFX11-NEXT: v_mov_b32_e32 v71, s50 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 -; GFX11-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v74, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 3 ; GFX11-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 ; GFX11-NEXT: v_dual_mov_b32 v9, s12 :: v_dual_mov_b32 v10, s13 -; GFX11-NEXT: v_mov_b32_e32 v73, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 4 -; GFX11-NEXT: v_mov_b32_e32 v55, s48 +; GFX11-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 4 ; GFX11-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 ; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v72, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 5 -; GFX11-NEXT: v_mov_b32_e32 v49, s39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v63, s0 +; 
GFX11-NEXT: v_readlane_b32 s0, v77, 5 ; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 -; GFX11-NEXT: v_mov_b32_e32 v62, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 6 -; GFX11-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 -; GFX11-NEXT: v_dual_mov_b32 v38, s16 :: v_dual_mov_b32 v39, s17 +; GFX11-NEXT: v_mov_b32_e32 v61, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-NEXT: v_dual_mov_b32 v51, s48 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v38, s16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v63, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 7 -; GFX11-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v36, s18 +; GFX11-NEXT: v_dual_mov_b32 v39, s17 :: v_dual_mov_b32 v62, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-NEXT: v_dual_mov_b32 v49, s39 :: v_dual_mov_b32 v36, s18 ; GFX11-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v32, s20 ; GFX11-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v60, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-NEXT: v_readlane_b32 s0, v77, 8 ; GFX11-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 ; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v61, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 9 -; GFX11-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 -; GFX11-NEXT: v_dual_mov_b32 v146, s42 :: v_dual_mov_b32 v145, s104 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v59, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 10 -; GFX11-NEXT: v_dual_mov_b32 v144, vcc_hi :: v_dual_mov_b32 v135, s103 -; GFX11-NEXT: v_dual_mov_b32 v134, s102 :: v_dual_mov_b32 v133, s101 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_readlane_b32 s0, v77, 9 +; GFX11-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v20, s26 +; GFX11-NEXT: v_mov_b32_e32 v21, s27 +; GFX11-NEXT: v_dual_mov_b32 v145, s42 :: v_dual_mov_b32 v144, s103 +; GFX11-NEXT: v_mov_b32_e32 v58, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 10 +; GFX11-NEXT: v_mov_b32_e32 v31, s36 +; GFX11-NEXT: v_dual_mov_b32 v135, s104 :: v_dual_mov_b32 v134, s102 +; GFX11-NEXT: v_dual_mov_b32 v133, s101 :: v_dual_mov_b32 v132, s100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v56, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 11 +; GFX11-NEXT: v_dual_mov_b32 v131, s98 :: v_dual_mov_b32 v130, s99 +; GFX11-NEXT: v_dual_mov_b32 v129, s97 :: v_dual_mov_b32 v128, s96 ; GFX11-NEXT: v_mov_b32_e32 v57, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 11 -; GFX11-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v132, s98 -; GFX11-NEXT: v_dual_mov_b32 v131, s100 :: v_dual_mov_b32 v130, s97 -; GFX11-NEXT: v_dual_mov_b32 v129, s96 :: v_dual_mov_b32 v58, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 12 -; GFX11-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v128, s87 -; GFX11-NEXT: v_dual_mov_b32 v119, s85 :: v_dual_mov_b32 v118, s86 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v117, s84 :: v_dual_mov_b32 v56, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 13 -; GFX11-NEXT: v_dual_mov_b32 v116, s83 :: v_dual_mov_b32 v115, s82 -; GFX11-NEXT: v_dual_mov_b32 v114, s80 :: v_dual_mov_b32 v113, s81 +; GFX11-NEXT: v_readlane_b32 s0, v77, 12 +; GFX11-NEXT: v_dual_mov_b32 v119, s87 :: v_dual_mov_b32 v118, s85 +; GFX11-NEXT: v_dual_mov_b32 v117, s86 :: v_dual_mov_b32 v116, s84 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v47, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 14 -; GFX11-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v112, s71 -; 
GFX11-NEXT: v_dual_mov_b32 v103, s70 :: v_dual_mov_b32 v102, s69 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v101, s67 :: v_dual_mov_b32 v46, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 15 -; GFX11-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v100, s68 -; GFX11-NEXT: v_dual_mov_b32 v99, s66 :: v_dual_mov_b32 v98, s65 -; GFX11-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v44, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 16 -; GFX11-NEXT: v_dual_mov_b32 v96, s54 :: v_dual_mov_b32 v87, s55 -; GFX11-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v85, s52 +; GFX11-NEXT: v_readlane_b32 s0, v77, 13 +; GFX11-NEXT: v_mov_b32_e32 v27, s37 +; GFX11-NEXT: v_dual_mov_b32 v115, s83 :: v_dual_mov_b32 v114, s82 +; GFX11-NEXT: v_dual_mov_b32 v113, s80 :: v_dual_mov_b32 v112, s81 +; GFX11-NEXT: v_mov_b32_e32 v46, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 14 +; GFX11-NEXT: v_dual_mov_b32 v103, s71 :: v_dual_mov_b32 v102, s70 +; GFX11-NEXT: v_dual_mov_b32 v101, s69 :: v_dual_mov_b32 v100, s67 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v45, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 17 -; GFX11-NEXT: v_dual_mov_b32 v84, s51 :: v_dual_mov_b32 v83, s49 -; GFX11-NEXT: v_dual_mov_b32 v147, s43 :: v_dual_mov_b32 v22, s78 +; GFX11-NEXT: v_readlane_b32 s0, v77, 15 +; GFX11-NEXT: v_dual_mov_b32 v99, s68 :: v_dual_mov_b32 v98, s66 +; GFX11-NEXT: v_dual_mov_b32 v97, s65 :: v_dual_mov_b32 v96, s64 ; GFX11-NEXT: v_mov_b32_e32 v43, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 18 -; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 -; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readlane_b32 s0, v77, 16 +; GFX11-NEXT: v_mov_b32_e32 v23, s35 +; GFX11-NEXT: v_dual_mov_b32 v87, s54 :: v_dual_mov_b32 v86, s55 +; GFX11-NEXT: 
v_dual_mov_b32 v85, s53 :: v_dual_mov_b32 v84, s52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mov_b32_e32 v44, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 17 +; GFX11-NEXT: v_mov_b32_e32 v19, s34 +; GFX11-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v82, s49 +; GFX11-NEXT: v_dual_mov_b32 v65, s60 :: v_dual_mov_b32 v30, s90 ; GFX11-NEXT: v_mov_b32_e32 v42, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 19 -; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 -; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 +; GFX11-NEXT: v_readlane_b32 s0, v77, 18 +; GFX11-NEXT: v_dual_mov_b32 v69, s56 :: v_dual_mov_b32 v34, s92 +; GFX11-NEXT: v_mov_b32_e32 v17, s74 +; GFX11-NEXT: v_mov_b32_e32 v67, s30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v41, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 20 -; GFX11-NEXT: v_mov_b32_e32 v48, s62 -; GFX11-NEXT: v_mov_b32_e32 v54, s72 -; GFX11-NEXT: v_mov_b32_e32 v64, s60 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v183, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 21 -; GFX11-NEXT: v_mov_b32_e32 v80, s46 -; GFX11-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-NEXT: v_readlane_b32 s0, v77, 19 +; GFX11-NEXT: v_mov_b32_e32 v146, s43 +; GFX11-NEXT: v_mov_b32_e32 v50, s62 +; GFX11-NEXT: v_mov_b32_e32 v64, s72 +; GFX11-NEXT: v_mov_b32_e32 v66, s58 ; GFX11-NEXT: v_mov_b32_e32 v40, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v77, 20 +; GFX11-NEXT: v_mov_b32_e32 v70, s46 +; GFX11-NEXT: v_mov_b32_e32 v80, s44 +; GFX11-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-NEXT: v_mov_b32_e32 v22, s78 ; GFX11-NEXT: v_mov_b32_e32 v182, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-NEXT: v_readlane_b32 s0, v77, 21 +; GFX11-NEXT: v_mov_b32_e32 
v26, s88 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v48, s94 :: v_dual_mov_b32 v183, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 22 ; GFX11-NEXT: v_mov_b32_e32 v181, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-NEXT: v_readlane_b32 s0, v77, 23 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v180, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 25 -; GFX11-NEXT: v_mov_b32_e32 v178, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v77, 24 ; GFX11-NEXT: v_mov_b32_e32 v179, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-NEXT: v_readlane_b32 s0, v77, 25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v177, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-NEXT: v_readlane_b32 s0, v77, 26 +; GFX11-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v176, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-NEXT: v_readlane_b32 s0, v77, 28 ; GFX11-NEXT: v_mov_b32_e32 v167, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-NEXT: v_readlane_b32 s0, v77, 29 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v165, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 31 ; GFX11-NEXT: v_mov_b32_e32 v166, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v77, 30 ; GFX11-NEXT: v_mov_b32_e32 v164, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-NEXT: v_readlane_b32 s0, v77, 31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 0 ; GFX11-NEXT: v_mov_b32_e32 v163, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-NEXT: v_readlane_b32 s0, v76, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v162, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 3 -; GFX11-NEXT: v_mov_b32_e32 v160, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 2 ; GFX11-NEXT: v_mov_b32_e32 v161, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-NEXT: v_readlane_b32 s0, v76, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v151, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v150, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-NEXT: v_readlane_b32 s0, v76, 6 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 -; GFX11-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-NEXT: v_mov_b32_e32 v147, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-NEXT: v_readlane_b32 s1, v77, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v81, s0 ; GFX11-NEXT: .LBB95_5: ; %end -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 -; 
GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v82 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v63 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v81 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v73 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GFX11-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v73 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v55 ; GFX11-NEXT: v_lshlrev_b32_e32 v58, 8, v58 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_or_b32_e32 v66, v69, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-NEXT: v_and_b32_e32 v56, 0xff, v56 +; GFX11-NEXT: v_lshlrev_b32_e32 v57, 8, v57 +; GFX11-NEXT: v_or_b32_e32 v54, v54, v81 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v72 ; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v80 -; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v66 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-NEXT: v_and_b32_e32 v46, 0xff, v46 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v68, v81, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v63 +; GFX11-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GFX11-NEXT: v_or_b32_e32 v70, v46, v70 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 ; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v81 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v61 +; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v62 +; GFX11-NEXT: v_and_b32_e32 v62, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v68 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v60 +; GFX11-NEXT: v_or_b32_e32 v81, v81, v61 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-NEXT: v_or_b32_e32 v69, v41, v69 +; GFX11-NEXT: v_or_b32_e32 v52, v52, v54 +; GFX11-NEXT: v_or_b32_e32 v54, v62, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v59 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v54 ; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-NEXT: v_or_b32_e32 v69, v69, v82 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v60 -; GFX11-NEXT: v_and_b32_e32 v60, 0xff, v61 +; GFX11-NEXT: v_or_b32_e32 v68, v80, v68 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v53 +; GFX11-NEXT: v_or_b32_e32 v53, v55, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v47 ; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v64 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_or_b32_e32 v82, v50, v82 -; GFX11-NEXT: v_or_b32_e32 v81, v60, v81 -; GFX11-NEXT: v_or_b32_e32 v50, v52, v66 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v69 -; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v51 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v59 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-NEXT: v_or_b32_e32 v51, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v82 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v81 -; GFX11-NEXT: v_or_b32_e32 v66, v66, v69 -; GFX11-NEXT: v_or_b32_e32 v69, v57, v58 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v47 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v53, 0xffff, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v68 +; GFX11-NEXT: v_or_b32_e32 v68, v80, v58 +; GFX11-NEXT: v_or_b32_e32 v80, v56, v57 ; GFX11-NEXT: v_or_b32_e32 v38, v38, v81 -; GFX11-NEXT: v_or_b32_e32 
v69, v82, v80 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v46 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v44 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v45 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v66 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v43 +; GFX11-NEXT: v_or_b32_e32 v54, v54, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v80 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v45 +; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v44 ; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v65 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v70 ; GFX11-NEXT: v_or_b32_e32 v39, v39, v80 -; GFX11-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-NEXT: v_or_b32_e32 v70, v81, v43 ; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v43 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v42 -; GFX11-NEXT: v_or_b32_e32 v36, v38, v66 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v42 +; GFX11-NEXT: v_or_b32_e32 v36, v38, v68 ; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v66, v80, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v70 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v70 ; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v41 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v183 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v40 +; GFX11-NEXT: v_or_b32_e32 v68, v80, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v40 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v182 +; GFX11-NEXT: v_lshlrev_b32_e32 v182, 8, v183 ; GFX11-NEXT: v_or_b32_e32 v37, v38, v39 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v66 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v68 ; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v66, v70, v80 -; GFX11-NEXT: v_or_b32_e32 v69, v81, v82 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v181 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v182 
-; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v180 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-NEXT: v_or_b32_e32 v68, v70, v80 +; GFX11-NEXT: v_or_b32_e32 v69, v81, v182 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v181 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v180 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v179 +; GFX11-NEXT: v_and_b32_e32 v68, 0xffff, v68 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v67, v80, v67 ; GFX11-NEXT: v_or_b32_e32 v32, v32, v70 +; GFX11-NEXT: v_or_b32_e32 v66, v80, v66 ; GFX11-NEXT: v_or_b32_e32 v33, v33, v81 ; GFX11-NEXT: v_or_b32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v39, v66, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v67 -; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v178 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v179 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v177 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v176 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v167 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v165 -; GFX11-NEXT: v_lshlrev_b32_e32 v165, 8, v166 -; GFX11-NEXT: v_or_b32_e32 v67, v67, v69 +; GFX11-NEXT: v_or_b32_e32 v39, v68, v69 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v178 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v167 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v166 +; GFX11-NEXT: v_and_b32_e32 v164, 0xff, v164 +; GFX11-NEXT: v_lshlrev_b32_e32 v165, 8, v165 +; GFX11-NEXT: v_or_b32_e32 v68, v68, v69 ; GFX11-NEXT: v_or_b32_e32 v28, v28, v70 -; GFX11-NEXT: v_or_b32_e32 v64, v80, v64 +; GFX11-NEXT: v_or_b32_e32 v65, v80, v65 ; GFX11-NEXT: v_or_b32_e32 v29, v29, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v165 +; GFX11-NEXT: v_or_b32_e32 v69, v164, v165 ; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v66 ; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 ; 
GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v65 ; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off ; GFX11-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 ; GFX11-NEXT: v_or_b32_e32 v36, v32, v66 -; GFX11-NEXT: v_or_b32_e32 v37, v33, v67 -; GFX11-NEXT: v_or_b32_e32 v38, v28, v64 +; GFX11-NEXT: v_or_b32_e32 v37, v33, v68 +; GFX11-NEXT: v_or_b32_e32 v38, v28, v65 ; GFX11-NEXT: v_or_b32_e32 v39, v29, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v163 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v162 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v160 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v162 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v160 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v150 ; GFX11-NEXT: v_or_b32_e32 v24, v24, v28 ; GFX11-NEXT: v_or_b32_e32 v28, v29, v32 ; GFX11-NEXT: v_or_b32_e32 v25, v25, v33 -; GFX11-NEXT: v_or_b32_e32 v29, v50, v51 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v52 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v150 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v48 +; GFX11-NEXT: v_or_b32_e32 v29, v52, v53 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v50 ; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v149 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v148 
-; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v147 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v146 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v145 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v147 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v146 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v54 ; GFX11-NEXT: v_or_b32_e32 v32, v32, v33 -; GFX11-NEXT: v_or_b32_e32 v21, v21, v48 -; GFX11-NEXT: v_or_b32_e32 v33, v50, v51 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v52 -; GFX11-NEXT: v_or_b32_e32 v48, v53, v54 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v50 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_or_b32_e32 v33, v52, v53 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v145 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v144 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v67 ; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_or_b32_e32 v50, v24, v28 -; GFX11-NEXT: v_or_b32_e32 v52, v20, v32 -; GFX11-NEXT: v_or_b32_e32 v53, v21, v33 -; GFX11-NEXT: v_or_b32_e32 v64, v15, v48 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v144 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v135 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v134 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v133 -; GFX11-NEXT: v_or_b32_e32 v51, v25, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v65 +; 
GFX11-NEXT: v_or_b32_e32 v13, v13, v54 +; GFX11-NEXT: v_or_b32_e32 v50, v55, v64 +; GFX11-NEXT: v_or_b32_e32 v52, v24, v28 +; GFX11-NEXT: v_or_b32_e32 v53, v25, v29 +; GFX11-NEXT: v_or_b32_e32 v54, v20, v32 +; GFX11-NEXT: v_or_b32_e32 v55, v21, v33 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v131 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-NEXT: v_or_b32_e32 v13, v13, v24 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v130 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v129 -; GFX11-NEXT: v_or_b32_e32 v20, v25, v28 -; GFX11-NEXT: v_or_b32_e32 v14, v14, v29 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v135 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v134 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v133 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v132 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v130 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v20, v21, v24 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v25 +; GFX11-NEXT: v_or_b32_e32 v21, v28, v29 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v32 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v129 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v128 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v119 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v119 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v34 ; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v118 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v117 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v116 -; GFX11-NEXT: v_or_b32_e32 v21, v21, v24 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: 
v_and_b32_e32 v13, 0xffff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v117 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v116 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v115 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v28 +; GFX11-NEXT: v_or_b32_e32 v25, v29, v32 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-NEXT: v_or_b32_e32 v28, v34, v48 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v25 -; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-NEXT: v_or_b32_e32 v12, v12, v32 -; GFX11-NEXT: v_or_b32_e32 v25, v33, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_or_b32_e32 v65, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v66, v13, v20 -; GFX11-NEXT: v_or_b32_e32 v67, v14, v21 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v21 +; GFX11-NEXT: v_or_b32_e32 v64, v11, v25 +; GFX11-NEXT: v_or_b32_e32 v65, v12, v28 ; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v115 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v30 ; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v113 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v112 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v103 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v24 -; GFX11-NEXT: v_or_b32_e32 
v12, v12, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v112 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v102 -; GFX11-NEXT: v_or_b32_e32 v9, v9, v13 -; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX11-NEXT: v_or_b32_e32 v14, v20, v21 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v101 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v26 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v101 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v103 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v102 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v28 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v26 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v97 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v87 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v24 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v99 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v98 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v20 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v25 -; GFX11-NEXT: v_or_b32_e32 v20, v26, v22 +; GFX11-NEXT: v_or_b32_e32 v12, v24, v25 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v99 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v98 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v97 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v28 +; GFX11-NEXT: v_or_b32_e32 v22, v29, v22 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, 
v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_or_b32_e32 v16, v21, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v13, v9, v13 -; GFX11-NEXT: v_or_b32_e32 v14, v10, v14 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v15 -; GFX11-NEXT: v_or_b32_e32 v9, v5, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v66, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v67, v10, v12 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v20 +; GFX11-NEXT: v_or_b32_e32 v9, v5, v22 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v87 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v86 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v85 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v85 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v82 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v21 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v83 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v71 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 ; GFX11-NEXT: v_or_b32_e32 v10, v20, v18 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v12 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 
v16, 8, v49 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v49 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v51 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v35 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v31 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17 @@ -191426,99 +192107,101 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v27 ; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v23 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v18 -; GFX11-NEXT: v_or_b32_e32 v16, v20, v17 +; GFX11-NEXT: v_or_b32_e32 v12, v20, v17 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v21 ; GFX11-NEXT: v_or_b32_e32 v17, v22, v19 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v50 ; GFX11-NEXT: v_or_b32_e32 v10, v5, v6 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v11 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v18 -; GFX11-NEXT: v_or_b32_e32 v3, v19, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v19, v12 ; GFX11-NEXT: v_or_b32_e32 v4, v20, v17 ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_store_b128 v0, v[36:39], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off 
offset:64 -; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:72 -; GFX11-NEXT: v_readlane_b32 s104, v76, 8 -; GFX11-NEXT: v_readlane_b32 s103, v76, 7 -; GFX11-NEXT: v_readlane_b32 s102, v76, 6 -; GFX11-NEXT: v_readlane_b32 s101, v76, 5 -; GFX11-NEXT: v_readlane_b32 s100, v76, 4 -; GFX11-NEXT: v_readlane_b32 s99, v76, 3 -; GFX11-NEXT: v_readlane_b32 s98, v76, 2 -; GFX11-NEXT: v_readlane_b32 s97, v76, 1 -; GFX11-NEXT: v_readlane_b32 s96, v76, 0 -; GFX11-NEXT: v_readlane_b32 s87, v75, 31 -; GFX11-NEXT: v_readlane_b32 s86, v75, 30 -; GFX11-NEXT: v_readlane_b32 s85, v75, 29 -; GFX11-NEXT: 
v_readlane_b32 s84, v75, 28 -; GFX11-NEXT: v_readlane_b32 s83, v75, 27 -; GFX11-NEXT: v_readlane_b32 s82, v75, 26 -; GFX11-NEXT: v_readlane_b32 s81, v75, 25 -; GFX11-NEXT: v_readlane_b32 s80, v75, 24 -; GFX11-NEXT: v_readlane_b32 s71, v75, 23 -; GFX11-NEXT: v_readlane_b32 s70, v75, 22 -; GFX11-NEXT: v_readlane_b32 s69, v75, 21 -; GFX11-NEXT: v_readlane_b32 s68, v75, 20 -; GFX11-NEXT: v_readlane_b32 s67, v75, 19 -; GFX11-NEXT: v_readlane_b32 s66, v75, 18 -; GFX11-NEXT: v_readlane_b32 s65, v75, 17 -; GFX11-NEXT: v_readlane_b32 s64, v75, 16 -; GFX11-NEXT: v_readlane_b32 s55, v75, 15 -; GFX11-NEXT: v_readlane_b32 s54, v75, 14 -; GFX11-NEXT: v_readlane_b32 s53, v75, 13 -; GFX11-NEXT: v_readlane_b32 s52, v75, 12 -; GFX11-NEXT: v_readlane_b32 s51, v75, 11 -; GFX11-NEXT: v_readlane_b32 s50, v75, 10 -; GFX11-NEXT: v_readlane_b32 s49, v75, 9 -; GFX11-NEXT: v_readlane_b32 s48, v75, 8 -; GFX11-NEXT: v_readlane_b32 s39, v75, 7 -; GFX11-NEXT: v_readlane_b32 s38, v75, 6 -; GFX11-NEXT: v_readlane_b32 s37, v75, 5 -; GFX11-NEXT: v_readlane_b32 s36, v75, 4 -; GFX11-NEXT: v_readlane_b32 s35, v75, 3 -; GFX11-NEXT: v_readlane_b32 s34, v75, 2 -; GFX11-NEXT: v_readlane_b32 s31, v75, 1 -; GFX11-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 
v44, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:68 +; GFX11-NEXT: v_readlane_b32 s104, v75, 8 +; GFX11-NEXT: v_readlane_b32 s103, v75, 7 +; GFX11-NEXT: v_readlane_b32 s102, v75, 6 +; GFX11-NEXT: v_readlane_b32 s101, v75, 5 +; GFX11-NEXT: v_readlane_b32 s100, v75, 4 +; GFX11-NEXT: v_readlane_b32 s99, v75, 3 +; GFX11-NEXT: v_readlane_b32 s98, v75, 2 +; GFX11-NEXT: v_readlane_b32 s97, v75, 1 +; GFX11-NEXT: v_readlane_b32 s96, v75, 0 +; GFX11-NEXT: v_readlane_b32 s87, v74, 31 +; GFX11-NEXT: v_readlane_b32 s86, v74, 30 +; GFX11-NEXT: v_readlane_b32 s85, v74, 29 +; GFX11-NEXT: v_readlane_b32 s84, v74, 28 +; GFX11-NEXT: v_readlane_b32 s83, v74, 27 +; GFX11-NEXT: v_readlane_b32 s82, v74, 26 +; GFX11-NEXT: v_readlane_b32 s81, v74, 25 +; GFX11-NEXT: v_readlane_b32 s80, v74, 24 +; GFX11-NEXT: v_readlane_b32 s71, v74, 23 +; GFX11-NEXT: v_readlane_b32 s70, v74, 22 +; GFX11-NEXT: v_readlane_b32 s69, v74, 21 +; GFX11-NEXT: v_readlane_b32 s68, v74, 20 +; GFX11-NEXT: v_readlane_b32 s67, v74, 19 +; GFX11-NEXT: v_readlane_b32 s66, v74, 18 +; GFX11-NEXT: v_readlane_b32 s65, v74, 17 +; GFX11-NEXT: v_readlane_b32 s64, v74, 16 +; GFX11-NEXT: v_readlane_b32 s55, v74, 15 +; GFX11-NEXT: v_readlane_b32 s54, v74, 14 +; GFX11-NEXT: v_readlane_b32 s53, v74, 13 +; GFX11-NEXT: v_readlane_b32 s52, v74, 12 +; GFX11-NEXT: v_readlane_b32 s51, v74, 11 +; GFX11-NEXT: v_readlane_b32 s50, v74, 10 +; GFX11-NEXT: v_readlane_b32 s49, v74, 9 +; GFX11-NEXT: v_readlane_b32 s48, v74, 8 +; GFX11-NEXT: v_readlane_b32 s39, v74, 7 +; GFX11-NEXT: v_readlane_b32 s38, v74, 6 +; GFX11-NEXT: v_readlane_b32 s37, v74, 5 +; GFX11-NEXT: v_readlane_b32 s36, v74, 4 +; GFX11-NEXT: v_readlane_b32 s35, v74, 3 +; GFX11-NEXT: v_readlane_b32 s34, v74, 2 +; GFX11-NEXT: v_readlane_b32 s31, v74, 1 +; GFX11-NEXT: v_readlane_b32 s30, v74, 0 ; 
GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -197888,8 +198571,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill @@ -197907,46 +198590,38 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 
offset:300 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 -; 
SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:196 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196 ; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_writelane_b32 v62, s28, 0 -; SI-NEXT: v_writelane_b32 v62, s25, 1 -; SI-NEXT: v_writelane_b32 v62, s24, 2 -; SI-NEXT: v_writelane_b32 v62, s23, 3 -; SI-NEXT: v_writelane_b32 v62, s22, 4 -; SI-NEXT: v_writelane_b32 v62, s21, 5 -; SI-NEXT: v_writelane_b32 v62, s18, 6 -; SI-NEXT: v_writelane_b32 v62, s16, 7 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 @@ -197961,14 +198636,23 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v63, s52, 12 ; SI-NEXT: v_writelane_b32 v63, s53, 13 ; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v62, s28, 0 ; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v62, s27, 1 ; 
SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v62, s25, 2 ; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v62, s24, 3 ; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v62, s23, 4 ; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v62, s22, 5 ; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v62, s21, 6 ; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v62, s16, 7 ; SI-NEXT: v_writelane_b32 v63, s71, 23 ; SI-NEXT: v_writelane_b32 v63, s80, 24 ; SI-NEXT: v_writelane_b32 v63, s81, 25 @@ -197978,258 +198662,257 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v63, s85, 29 ; SI-NEXT: v_writelane_b32 v63, s86, 30 ; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: s_mov_b32 s76, s18 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, v26 ; SI-NEXT: v_readfirstlane_b32 s15, v16 -; SI-NEXT: v_readfirstlane_b32 s18, v25 -; SI-NEXT: v_readfirstlane_b32 s43, v15 +; SI-NEXT: v_readfirstlane_b32 s16, v15 ; SI-NEXT: v_readfirstlane_b32 s42, v24 -; SI-NEXT: v_readfirstlane_b32 s44, v23 -; SI-NEXT: v_readfirstlane_b32 s49, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s53, v20 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s49, v11 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_readfirstlane_b32 s31, v28 +; SI-NEXT: v_writelane_b32 v63, s99, 35 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s50, v35 +; SI-NEXT: v_readfirstlane_b32 s38, v37 +; SI-NEXT: v_readfirstlane_b32 s64, v48 +; SI-NEXT: v_readfirstlane_b32 s82, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: 
v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_readfirstlane_b32 s7, v52 +; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: v_writelane_b32 v62, s4, 9 -; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v62, s4, 11 -; SI-NEXT: v_readfirstlane_b32 s79, v52 -; SI-NEXT: v_readfirstlane_b32 s88, v54 -; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:136 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_readfirstlane_b32 s77, v41 ; SI-NEXT: v_readfirstlane_b32 s4, v42 -; SI-NEXT: v_readfirstlane_b32 s94, v31 -; SI-NEXT: v_readfirstlane_b32 s70, v32 -; SI-NEXT: v_readfirstlane_b32 s51, v33 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s78, v54 +; SI-NEXT: v_readfirstlane_b32 s27, v41 +; SI-NEXT: v_readfirstlane_b32 s91, v43 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword 
v57, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136 +; SI-NEXT: v_readfirstlane_b32 s54, v31 +; SI-NEXT: v_readfirstlane_b32 s48, v32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s37, v45 -; SI-NEXT: v_readfirstlane_b32 s24, v56 -; SI-NEXT: v_readfirstlane_b32 s7, v57 -; SI-NEXT: v_readfirstlane_b32 s92, v58 -; SI-NEXT: v_readfirstlane_b32 s28, v59 +; SI-NEXT: v_readfirstlane_b32 s37, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v60 +; SI-NEXT: v_readfirstlane_b32 s71, v58 +; SI-NEXT: v_readfirstlane_b32 s21, v59 +; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: v_readfirstlane_b32 s4, v61 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s35, v43 -; SI-NEXT: v_readfirstlane_b32 s55, v46 -; SI-NEXT: v_readfirstlane_b32 s68, v35 -; SI-NEXT: v_readfirstlane_b32 s87, v37 -; SI-NEXT: v_readfirstlane_b32 s67, v39 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 +; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s74, v53 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: v_readfirstlane_b32 s85, v48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_readfirstlane_b32 s69, v46 +; SI-NEXT: v_readfirstlane_b32 s66, v56 +; SI-NEXT: v_readfirstlane_b32 s28, v34 +; SI-NEXT: v_readfirstlane_b32 s6, v36 +; SI-NEXT: v_readfirstlane_b32 s85, v38 +; SI-NEXT: v_readfirstlane_b32 s70, v39 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 ; SI-NEXT: v_writelane_b32 v62, s4, 13 -; SI-NEXT: v_readfirstlane_b32 s98, v40 -; SI-NEXT: v_readfirstlane_b32 s69, v51 -; SI-NEXT: v_readfirstlane_b32 s21, v36 -; SI-NEXT: v_readfirstlane_b32 s40, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v28 -; SI-NEXT: v_readfirstlane_b32 s34, v27 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_readfirstlane_b32 s86, v50 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v21, v13 ; SI-NEXT: v_mov_b32_e32 v13, v5 -; SI-NEXT: v_readfirstlane_b32 s97, v29 -; SI-NEXT: v_readfirstlane_b32 s80, v18 +; SI-NEXT: v_readfirstlane_b32 s99, v27 +; SI-NEXT: v_readfirstlane_b32 s89, v29 +; SI-NEXT: v_readfirstlane_b32 s97, 
v25 +; SI-NEXT: v_readfirstlane_b32 s75, v23 +; SI-NEXT: v_readfirstlane_b32 s79, v19 +; SI-NEXT: v_readfirstlane_b32 s83, v18 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v22 ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v30 -; SI-NEXT: v_readfirstlane_b32 s96, v17 -; SI-NEXT: v_readfirstlane_b32 s64, v9 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s83, v7 -; SI-NEXT: v_readfirstlane_b32 s84, v4 -; SI-NEXT: v_readfirstlane_b32 s93, v3 -; SI-NEXT: v_readfirstlane_b32 s76, v1 -; SI-NEXT: v_readfirstlane_b32 s58, v38 -; SI-NEXT: v_readfirstlane_b32 s65, v49 -; SI-NEXT: v_readfirstlane_b32 s62, v54 -; SI-NEXT: v_readfirstlane_b32 s81, v44 -; SI-NEXT: v_readfirstlane_b32 s71, v47 -; SI-NEXT: v_readfirstlane_b32 s38, v60 -; SI-NEXT: v_readfirstlane_b32 s86, v61 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:220 +; SI-NEXT: v_readfirstlane_b32 s34, v17 +; SI-NEXT: v_readfirstlane_b32 s68, v10 +; SI-NEXT: v_readfirstlane_b32 s77, v9 +; SI-NEXT: v_readfirstlane_b32 s80, v8 +; SI-NEXT: v_readfirstlane_b32 s84, v7 +; SI-NEXT: v_readfirstlane_b32 s95, v4 +; SI-NEXT: v_readfirstlane_b32 s96, v3 +; SI-NEXT: v_readfirstlane_b32 s67, v2 +; SI-NEXT: v_readfirstlane_b32 s59, v37 +; SI-NEXT: v_readfirstlane_b32 s94, v48 +; SI-NEXT: v_readfirstlane_b32 s35, v51 +; SI-NEXT: v_readfirstlane_b32 s81, v41 +; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: v_readfirstlane_b32 s74, v45 +; SI-NEXT: v_readfirstlane_b32 s18, v47 +; SI-NEXT: v_readfirstlane_b32 s51, v57 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s90, v50 -; SI-NEXT: v_readfirstlane_b32 s31, v52 -; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 +; 
SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s57, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s56, v61 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; SI-NEXT: v_readfirstlane_b32 s72, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 -; SI-NEXT: v_readfirstlane_b32 s82, v56 -; SI-NEXT: v_readfirstlane_b32 s95, v57 +; SI-NEXT: v_readfirstlane_b32 s55, v59 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s39, v58 -; SI-NEXT: v_readfirstlane_b32 s56, v59 -; SI-NEXT: v_readfirstlane_b32 s57, v41 -; SI-NEXT: v_readfirstlane_b32 s36, v42 -; SI-NEXT: v_readfirstlane_b32 s73, v45 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v56, off, 
s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 -; SI-NEXT: v_readfirstlane_b32 s16, v34 -; SI-NEXT: v_readfirstlane_b32 s48, v32 -; SI-NEXT: v_readfirstlane_b32 s52, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: v_readfirstlane_b32 s47, v35 -; SI-NEXT: v_readfirstlane_b32 s60, v37 -; SI-NEXT: v_readfirstlane_b32 s61, v39 -; SI-NEXT: v_readfirstlane_b32 s89, v43 +; SI-NEXT: v_readfirstlane_b32 s58, v54 +; SI-NEXT: v_readfirstlane_b32 s60, v40 +; SI-NEXT: v_readfirstlane_b32 s30, v42 +; SI-NEXT: v_readfirstlane_b32 s53, v43 +; SI-NEXT: v_readfirstlane_b32 s22, v58 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 +; SI-NEXT: v_readfirstlane_b32 s98, v32 +; SI-NEXT: v_readfirstlane_b32 s93, v33 +; SI-NEXT: v_readfirstlane_b32 s61, v34 +; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: v_readfirstlane_b32 s46, v46 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s99, v46 +; SI-NEXT: v_readfirstlane_b32 s23, v56 +; SI-NEXT: v_writelane_b32 v62, s4, 17 +; SI-NEXT: v_readfirstlane_b32 s52, v35 +; SI-NEXT: v_readfirstlane_b32 s92, v53 +; SI-NEXT: v_readfirstlane_b32 s39, v38 +; SI-NEXT: v_readfirstlane_b32 s72, v50 +; SI-NEXT: v_readfirstlane_b32 s36, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s25, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s24, v41 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s45, v44 +; SI-NEXT: s_waitcnt vmcnt(10) +; 
SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: v_readfirstlane_b32 s65, v36 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:280 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; SI-NEXT: v_readfirstlane_b32 s54, v48 -; SI-NEXT: v_readfirstlane_b32 s50, v53 -; SI-NEXT: v_readfirstlane_b32 s78, v49 -; SI-NEXT: v_readfirstlane_b32 s30, v51 -; SI-NEXT: v_readfirstlane_b32 s66, v54 -; SI-NEXT: v_readfirstlane_b32 s91, v40 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s6, v44 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s4, v10 -; SI-NEXT: v_writelane_b32 v62, s4, 15 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s15, 18 -; SI-NEXT: v_writelane_b32 v62, s18, 19 -; SI-NEXT: v_writelane_b32 v62, s43, 20 -; SI-NEXT: v_writelane_b32 v62, s42, 21 -; SI-NEXT: v_writelane_b32 v62, s44, 22 -; SI-NEXT: v_writelane_b32 v62, s16, 23 -; SI-NEXT: v_writelane_b32 v62, s49, 24 -; SI-NEXT: v_writelane_b32 v62, s8, 25 -; SI-NEXT: v_writelane_b32 v62, s6, 26 -; SI-NEXT: v_readfirstlane_b32 s45, v52 -; SI-NEXT: v_writelane_b32 v62, s56, 27 -; SI-NEXT: v_writelane_b32 v62, s45, 28 -; SI-NEXT: 
v_writelane_b32 v62, s53, 29 -; SI-NEXT: v_writelane_b32 v62, s94, 30 -; SI-NEXT: v_writelane_b32 v62, s57, 31 -; SI-NEXT: v_writelane_b32 v62, s58, 32 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_writelane_b32 v62, s17, 19 +; SI-NEXT: v_writelane_b32 v62, s15, 20 +; SI-NEXT: v_writelane_b32 v62, s16, 21 +; SI-NEXT: v_writelane_b32 v62, s42, 22 +; SI-NEXT: v_writelane_b32 v62, s46, 23 +; SI-NEXT: v_writelane_b32 v62, s51, 24 +; SI-NEXT: v_writelane_b32 v62, s56, 25 +; SI-NEXT: v_writelane_b32 v62, s57, 26 +; SI-NEXT: v_writelane_b32 v62, s52, 27 +; SI-NEXT: v_writelane_b32 v62, s8, 28 +; SI-NEXT: v_writelane_b32 v62, s58, 29 +; SI-NEXT: v_writelane_b32 v62, s49, 30 +; SI-NEXT: v_writelane_b32 v62, s45, 31 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s47, v52 +; SI-NEXT: v_writelane_b32 v62, s59, 32 ; SI-NEXT: v_writelane_b32 v62, s47, 33 -; SI-NEXT: v_readfirstlane_b32 s46, v55 -; SI-NEXT: v_writelane_b32 v62, s40, 34 -; SI-NEXT: v_readfirstlane_b32 s59, v47 -; SI-NEXT: v_writelane_b32 v62, s46, 35 -; SI-NEXT: v_writelane_b32 v62, s59, 36 -; SI-NEXT: v_writelane_b32 v62, s60, 37 -; SI-NEXT: v_writelane_b32 v62, s36, 38 -; SI-NEXT: v_writelane_b32 v62, s65, 39 -; SI-NEXT: v_writelane_b32 v62, s61, 40 -; SI-NEXT: v_writelane_b32 v62, s73, 41 -; SI-NEXT: v_writelane_b32 v62, s62, 42 -; SI-NEXT: v_writelane_b32 v62, s72, 43 -; SI-NEXT: v_writelane_b32 v62, s23, 44 -; SI-NEXT: v_writelane_b32 v62, s48, 45 -; SI-NEXT: v_writelane_b32 v62, s34, 46 -; SI-NEXT: v_writelane_b32 v62, s78, 47 -; SI-NEXT: v_writelane_b32 v62, s30, 48 -; SI-NEXT: v_writelane_b32 v62, s54, 49 -; SI-NEXT: v_writelane_b32 v62, s50, 50 -; SI-NEXT: v_writelane_b32 v62, s52, 51 -; SI-NEXT: v_writelane_b32 v62, s82, 52 -; SI-NEXT: v_writelane_b32 v62, s66, 53 -; SI-NEXT: v_readfirstlane_b32 s22, v36 +; SI-NEXT: v_writelane_b32 v62, s60, 34 +; SI-NEXT: v_writelane_b32 v62, s40, 35 +; SI-NEXT: v_writelane_b32 v62, s61, 36 +; SI-NEXT: 
v_writelane_b32 v62, s31, 37 +; SI-NEXT: v_writelane_b32 v62, s72, 38 +; SI-NEXT: v_writelane_b32 v62, s73, 39 +; SI-NEXT: v_writelane_b32 v62, s36, 40 +; SI-NEXT: v_writelane_b32 v62, s92, 41 +; SI-NEXT: v_writelane_b32 v62, s93, 42 +; SI-NEXT: v_writelane_b32 v62, s55, 43 +; SI-NEXT: v_writelane_b32 v62, s25, 44 +; SI-NEXT: v_readfirstlane_b32 s88, v45 +; SI-NEXT: v_readfirstlane_b32 s90, v47 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v43 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v57 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v58 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v58 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v59 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v42 ; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v40 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v42 -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v54 ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v31 -; SI-NEXT: v_writelane_b32 v62, s91, 54 +; SI-NEXT: v_writelane_b32 v62, s24, 45 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, 
off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v5, v13 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s5, v62, 5 +; SI-NEXT: v_readlane_b32 s5, v62, 6 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_writelane_b32 v62, s4, 55 -; SI-NEXT: v_readlane_b32 s4, v62, 4 +; SI-NEXT: v_writelane_b32 v62, s4, 46 +; SI-NEXT: v_readlane_b32 s4, v62, 5 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v62, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 4 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s63, s5, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 6 -; SI-NEXT: s_and_b32 s5, s4, 0xff +; SI-NEXT: s_and_b32 s5, s76, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s9, s19, 24 ; SI-NEXT: v_readlane_b32 s4, v62, 0 @@ -198237,175 +198920,185 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s5, s4, 0xff ; SI-NEXT: s_lshl_b32 s10, s29, 8 ; SI-NEXT: s_or_b32 s4, s5, s10 -; SI-NEXT: v_writelane_b32 v62, s4, 56 -; SI-NEXT: s_and_b32 s5, s76, 0xff -; SI-NEXT: v_readlane_b32 s10, v62, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 47 +; SI-NEXT: v_readlane_b32 s4, v62, 18 +; SI-NEXT: s_and_b32 s5, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s11, s10, 24 -; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: s_lshl_b32 s11, s67, 24 +; SI-NEXT: s_or_b32 s62, s11, s5 ; SI-NEXT: s_and_b32 s11, s26, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 1 ; SI-NEXT: s_lshl_b32 s11, 
s11, 16 -; SI-NEXT: s_lshl_b32 s12, s27, 24 +; SI-NEXT: s_lshl_b32 s12, s4, 24 ; SI-NEXT: s_or_b32 s14, s12, s11 -; SI-NEXT: s_and_b32 s11, s83, 0xff -; SI-NEXT: s_lshl_b32 s12, s25, 8 -; SI-NEXT: s_or_b32 s10, s11, s12 -; SI-NEXT: v_writelane_b32 v62, s10, 57 -; SI-NEXT: s_and_b32 s11, s64, 0xff -; SI-NEXT: v_readlane_b32 s10, v62, 15 +; SI-NEXT: s_and_b32 s11, s84, 0xff +; SI-NEXT: s_lshl_b32 s12, s80, 8 +; SI-NEXT: s_or_b32 s4, s11, s12 +; SI-NEXT: s_and_b32 s11, s77, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s13, s10, 24 +; SI-NEXT: s_lshl_b32 s13, s68, 24 ; SI-NEXT: s_or_b32 s41, s13, s11 -; SI-NEXT: s_and_b32 s11, s43, 0xff +; SI-NEXT: s_and_b32 s11, s16, 0xff ; SI-NEXT: s_lshl_b32 s13, s15, 8 -; SI-NEXT: s_or_b32 s10, s11, s13 -; SI-NEXT: s_and_b32 s11, s96, 0xff +; SI-NEXT: v_writelane_b32 v62, s4, 48 +; SI-NEXT: s_or_b32 s4, s11, s13 +; SI-NEXT: s_and_b32 s11, s34, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s80, 24 +; SI-NEXT: s_lshl_b32 s15, s83, 24 ; SI-NEXT: s_or_b32 s43, s15, s11 -; SI-NEXT: s_and_b32 s11, s44, 0xff +; SI-NEXT: s_and_b32 s11, s75, 0xff ; SI-NEXT: s_lshl_b32 s15, s42, 8 ; SI-NEXT: s_or_b32 s13, s11, s15 -; SI-NEXT: s_and_b32 s11, s18, 0xff +; SI-NEXT: s_and_b32 s11, s97, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s97, 24 +; SI-NEXT: s_lshl_b32 s15, s89, 24 ; SI-NEXT: s_or_b32 s44, s15, s11 -; SI-NEXT: s_and_b32 s11, s59, 0xff -; SI-NEXT: s_lshl_b32 s15, s46, 8 +; SI-NEXT: s_and_b32 s11, s90, 0xff +; SI-NEXT: s_lshl_b32 s15, s88, 8 ; SI-NEXT: s_or_b32 s12, s11, s15 -; SI-NEXT: s_and_b32 s11, s45, 0xff +; SI-NEXT: s_and_b32 s11, s47, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s6, 24 +; SI-NEXT: s_lshl_b32 s15, s45, 24 ; SI-NEXT: s_or_b32 s45, s15, s11 -; SI-NEXT: s_and_b32 s11, s30, 0xff -; SI-NEXT: s_lshl_b32 s15, s78, 8 -; SI-NEXT: v_writelane_b32 v62, s10, 58 +; SI-NEXT: s_and_b32 s11, s36, 0xff +; SI-NEXT: s_lshl_b32 s15, s72, 8 ; SI-NEXT: 
s_or_b32 s10, s11, s15 -; SI-NEXT: s_and_b32 s11, s99, 0xff +; SI-NEXT: s_and_b32 s11, s23, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s89, 24 +; SI-NEXT: s_lshl_b32 s15, s46, 24 ; SI-NEXT: s_or_b32 s46, s15, s11 +; SI-NEXT: s_and_b32 s11, s39, 0xff +; SI-NEXT: s_lshl_b32 s15, s65, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 49 +; SI-NEXT: s_or_b32 s4, s11, s15 ; SI-NEXT: s_and_b32 s11, s61, 0xff -; SI-NEXT: s_lshl_b32 s15, s60, 8 -; SI-NEXT: s_or_b32 s6, s11, s15 -; SI-NEXT: s_and_b32 s11, s22, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s47, 24 +; SI-NEXT: s_lshl_b32 s15, s22, 24 ; SI-NEXT: s_or_b32 s47, s15, s11 -; SI-NEXT: s_and_b32 s11, s57, 0xff -; SI-NEXT: s_lshl_b32 s15, s56, 8 -; SI-NEXT: v_writelane_b32 v62, s6, 59 -; SI-NEXT: s_or_b32 s6, s11, s15 -; SI-NEXT: s_and_b32 s11, s39, 0xff -; SI-NEXT: v_writelane_b32 v62, s6, 60 +; SI-NEXT: s_and_b32 s11, s60, 0xff +; SI-NEXT: s_lshl_b32 s15, s58, 8 +; SI-NEXT: s_or_b32 s16, s11, s15 +; SI-NEXT: s_and_b32 s11, s56, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s95, 24 +; SI-NEXT: s_lshl_b32 s15, s57, 24 ; SI-NEXT: s_or_b32 s56, s15, s11 -; SI-NEXT: s_and_b32 s11, s48, 0xff -; SI-NEXT: s_lshl_b32 s15, s72, 8 -; SI-NEXT: v_readlane_b32 s6, v62, 14 -; SI-NEXT: s_or_b32 s48, s11, s15 -; SI-NEXT: s_and_b32 s11, s6, 0xff +; SI-NEXT: s_and_b32 s11, s98, 0xff +; SI-NEXT: s_lshl_b32 s15, s87, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 50 +; SI-NEXT: s_or_b32 s4, s11, s15 +; SI-NEXT: v_writelane_b32 v62, s4, 51 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 16 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s31, 24 +; SI-NEXT: s_lshl_b32 s15, s4, 24 ; SI-NEXT: s_or_b32 vcc_lo, s15, s11 -; SI-NEXT: s_and_b32 s11, s86, 0xff -; SI-NEXT: s_lshl_b32 s15, s38, 8 +; SI-NEXT: s_and_b32 s11, s18, 0xff +; SI-NEXT: s_lshl_b32 s15, s74, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 15 ; SI-NEXT: 
s_or_b32 s72, s11, s15 -; SI-NEXT: s_and_b32 s11, s71, 0xff +; SI-NEXT: s_and_b32 s11, s4, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s15, s81, 24 ; SI-NEXT: s_or_b32 vcc_hi, s15, s11 -; SI-NEXT: s_and_b32 s11, s58, 0xff -; SI-NEXT: s_lshl_b32 s15, s85, 8 +; SI-NEXT: s_and_b32 s11, s59, 0xff +; SI-NEXT: s_lshl_b32 s15, s86, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 14 ; SI-NEXT: s_or_b32 s57, s11, s15 -; SI-NEXT: s_and_b32 s11, s69, 0xff +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 13 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s74, 24 -; SI-NEXT: v_writelane_b32 v62, s74, 61 +; SI-NEXT: s_lshl_b32 s15, s4, 24 +; SI-NEXT: v_writelane_b32 v62, s74, 52 ; SI-NEXT: s_or_b32 s74, s15, s11 -; SI-NEXT: s_and_b32 s11, s87, 0xff -; SI-NEXT: s_lshl_b32 s15, s21, 8 +; SI-NEXT: s_and_b32 s11, s6, 0xff +; SI-NEXT: s_lshl_b32 s15, s28, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 12 ; SI-NEXT: s_or_b32 s58, s11, s15 -; SI-NEXT: s_and_b32 s11, s68, 0xff +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 11 +; SI-NEXT: v_writelane_b32 v62, s75, 53 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s28, 24 +; SI-NEXT: s_lshl_b32 s15, s4, 24 +; SI-NEXT: v_writelane_b32 v62, s6, 54 +; SI-NEXT: v_writelane_b32 v62, s23, 55 ; SI-NEXT: s_or_b32 s75, s15, s11 -; SI-NEXT: s_and_b32 s11, s24, 0xff -; SI-NEXT: s_lshl_b32 s15, s55, 8 -; SI-NEXT: v_writelane_b32 v62, s25, 62 +; SI-NEXT: s_and_b32 s11, s66, 0xff +; SI-NEXT: s_lshl_b32 s15, s69, 8 ; SI-NEXT: s_or_b32 s59, s11, s15 ; SI-NEXT: s_and_b32 s11, s37, 0xff +; SI-NEXT: v_writelane_b32 v62, s65, 56 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s51, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 13 -; SI-NEXT: s_mov_b32 s18, s21 -; SI-NEXT: s_mov_b32 s21, s97 -; SI-NEXT: s_mov_b32 s97, s37 +; SI-NEXT: s_lshl_b32 s15, s48, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 10 +; SI-NEXT: s_mov_b32 s65, s90 +; SI-NEXT: s_mov_b32 s90, s88 +; SI-NEXT: 
s_mov_b32 s88, s86 +; SI-NEXT: s_mov_b32 s86, s83 +; SI-NEXT: s_mov_b32 s83, s68 +; SI-NEXT: s_mov_b32 s68, s37 ; SI-NEXT: s_mov_b32 s37, s76 ; SI-NEXT: s_or_b32 s76, s15, s11 -; SI-NEXT: s_and_b32 s11, s35, 0xff -; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: s_lshl_b32 s15, s27, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 9 ; SI-NEXT: s_or_b32 s60, s11, s15 -; SI-NEXT: s_and_b32 s11, s77, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 12 +; SI-NEXT: s_and_b32 s11, s4, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s4, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: s_mov_b32 s6, s95 -; SI-NEXT: s_mov_b32 s95, s39 -; SI-NEXT: s_mov_b32 s39, s89 -; SI-NEXT: s_mov_b32 s89, s99 -; SI-NEXT: s_mov_b32 s99, s83 -; SI-NEXT: s_mov_b32 s83, s55 -; SI-NEXT: s_mov_b32 s55, s64 -; SI-NEXT: s_mov_b32 s64, s35 -; SI-NEXT: s_mov_b32 s35, s77 +; SI-NEXT: s_lshl_b32 s15, s78, 24 +; SI-NEXT: s_mov_b32 s36, s89 +; SI-NEXT: s_mov_b32 s89, s97 +; SI-NEXT: s_mov_b32 s97, s84 +; SI-NEXT: s_mov_b32 s84, s66 +; SI-NEXT: s_mov_b32 s66, s69 +; SI-NEXT: s_mov_b32 s69, s77 ; SI-NEXT: s_or_b32 s77, s15, s11 -; SI-NEXT: s_and_b32 s11, s4, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 10 -; SI-NEXT: s_lshl_b32 s15, s4, 8 -; SI-NEXT: v_readlane_b32 s4, v62, 9 +; SI-NEXT: s_and_b32 s11, s82, 0xff +; SI-NEXT: s_lshl_b32 s15, s64, 8 ; SI-NEXT: s_or_b32 s61, s11, s15 -; SI-NEXT: s_and_b32 s11, s4, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: s_and_b32 s11, s38, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s4, 24 +; SI-NEXT: s_lshl_b32 s15, s50, 24 +; SI-NEXT: v_readlane_b32 s6, v62, 7 +; SI-NEXT: s_mov_b32 s4, s22 +; SI-NEXT: s_mov_b32 s22, s81 +; SI-NEXT: s_mov_b32 s81, s82 +; SI-NEXT: s_mov_b32 s82, s64 +; SI-NEXT: s_mov_b32 s64, s50 +; SI-NEXT: s_mov_b32 s50, s38 +; SI-NEXT: s_mov_b32 s38, s78 ; SI-NEXT: s_or_b32 s78, s15, s11 -; SI-NEXT: v_readlane_b32 s11, v62, 7 -; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: 
s_and_b32 s11, s6, 0xff ; SI-NEXT: s_lshl_b32 s15, s17, 8 ; SI-NEXT: s_or_b32 s11, s11, s15 ; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 3 ; SI-NEXT: v_mov_b32_e32 v51, s9 -; SI-NEXT: s_or_b32 s17, s11, s9 -; SI-NEXT: v_readlane_b32 s9, v62, 2 -; SI-NEXT: v_readlane_b32 s11, v62, 1 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s15, s11, 8 +; SI-NEXT: s_or_b32 s11, s11, s9 +; SI-NEXT: s_and_b32 s9, s6, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 2 +; SI-NEXT: s_lshl_b32 s15, s6, 8 ; SI-NEXT: s_or_b32 s9, s9, s15 ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_mov_b32 s4, s96 -; SI-NEXT: s_mov_b32 s96, s24 ; SI-NEXT: v_mov_b32_e32 v52, s14 -; SI-NEXT: s_or_b32 s24, s9, s14 -; SI-NEXT: s_and_b32 s14, s93, 0xff -; SI-NEXT: s_lshl_b32 s15, s84, 8 +; SI-NEXT: s_or_b32 s17, s9, s14 +; SI-NEXT: s_and_b32 s14, s96, 0xff +; SI-NEXT: s_lshl_b32 s15, s95, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v53, v6, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v50, s14, v53 -; SI-NEXT: s_and_b32 s14, s8, 0xff -; SI-NEXT: s_lshl_b32 s15, s49, 8 +; SI-NEXT: s_and_b32 s14, s49, 0xff +; SI-NEXT: s_lshl_b32 s15, s8, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v54, v14, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v17, s14, v54 -; SI-NEXT: s_and_b32 s14, s40, 0xff -; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: s_and_b32 s14, s79, 0xff +; SI-NEXT: s_lshl_b32 s15, s40, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: s_or_b32 s14, s14, s15 @@ -198413,8 +199106,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v55, v18, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v16, s14, v55 -; SI-NEXT: s_and_b32 s14, s34, 0xff -; SI-NEXT: s_lshl_b32 s15, s23, 8 +; 
SI-NEXT: s_and_b32 s14, s99, 0xff +; SI-NEXT: s_lshl_b32 s15, s31, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 ; SI-NEXT: s_or_b32 s14, s14, s15 @@ -198422,167 +199115,166 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v40, v19, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v15, s14, v40 -; SI-NEXT: s_and_b32 s14, s91, 0xff -; SI-NEXT: s_lshl_b32 s15, s66, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: s_and_b32 s14, s24, 0xff +; SI-NEXT: s_lshl_b32 s15, s25, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v41, v22, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v12, s14, v41 -; SI-NEXT: s_and_b32 s14, s50, 0xff -; SI-NEXT: s_lshl_b32 s15, s54, 8 +; SI-NEXT: s_and_b32 s14, s92, 0xff +; SI-NEXT: s_lshl_b32 s15, s73, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v42, v23, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v11, s14, v42 -; SI-NEXT: s_and_b32 s14, s73, 0xff -; SI-NEXT: s_lshl_b32 s15, s36, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: s_and_b32 s14, s53, 0xff +; SI-NEXT: s_lshl_b32 s15, s30, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v28, v59, v1 +; SI-NEXT: v_or_b32_e32 v59, v24, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v10, s14, v28 -; SI-NEXT: s_and_b32 s14, s82, 0xff -; SI-NEXT: s_lshl_b32 s15, s52, 8 +; SI-NEXT: v_or_b32_e32 v10, s14, v59 +; SI-NEXT: s_and_b32 s14, s55, 0xff +; SI-NEXT: s_lshl_b32 s15, s93, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v60, v24, v1 +; SI-NEXT: v_or_b32_e32 v60, v28, v1 
; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v9, s14, v60 -; SI-NEXT: s_and_b32 s14, s90, 0xff -; SI-NEXT: s_lshl_b32 s15, s16, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: s_and_b32 s14, s52, 0xff +; SI-NEXT: s_lshl_b32 s15, s51, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v31, v44, v1 +; SI-NEXT: v_or_b32_e32 v31, v43, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v8, s14, v31 -; SI-NEXT: s_and_b32 s14, s62, 0xff -; SI-NEXT: s_lshl_b32 s15, s65, 8 +; SI-NEXT: s_and_b32 s14, s35, 0xff +; SI-NEXT: s_lshl_b32 s15, s94, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v61, v45, v1 +; SI-NEXT: v_or_b32_e32 v61, v44, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v7, s14, v61 -; SI-NEXT: s_and_b32 s14, s98, 0xff -; SI-NEXT: s_lshl_b32 s15, s67, 8 +; SI-NEXT: s_and_b32 s14, s70, 0xff +; SI-NEXT: s_lshl_b32 s15, s85, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v6, v47, v1 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v6, v45, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v4, s14, v6 -; SI-NEXT: s_and_b32 s14, s92, 0xff -; SI-NEXT: s_lshl_b32 s15, s7, 8 +; SI-NEXT: s_and_b32 s14, s21, 0xff +; SI-NEXT: s_lshl_b32 s15, s71, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 55 -; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_mov_b32_e32 v22, v14 -; SI-NEXT: v_or_b32_e32 v14, v56, v1 +; SI-NEXT: v_or_b32_e32 v14, v47, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_or_b32_e32 v2, s14, v14 -; SI-NEXT: 
s_and_b32 s14, s70, 0xff -; SI-NEXT: s_lshl_b32 s15, s94, 8 +; SI-NEXT: s_and_b32 s14, s54, 0xff +; SI-NEXT: s_lshl_b32 s15, s91, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 -; SI-NEXT: s_or_b32 s42, s8, s63 -; SI-NEXT: v_readlane_b32 s8, v62, 56 +; SI-NEXT: v_readlane_b32 s6, v62, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_lshl_b32 s15, s6, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 46 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v32, v23 ; SI-NEXT: v_mov_b32_e32 v23, v18 ; SI-NEXT: v_or_b32_e32 v18, v57, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_or_b32 s40, s8, s5 -; SI-NEXT: v_readlane_b32 s8, v62, 57 +; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 47 ; SI-NEXT: v_or_b32_e32 v1, s14, v18 -; SI-NEXT: s_and_b32 s14, s88, 0xff -; SI-NEXT: s_lshl_b32 s15, s79, 8 +; SI-NEXT: s_and_b32 s14, s7, 0xff ; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_readlane_b32 s9, v62, 60 +; SI-NEXT: s_or_b32 s42, s8, s63 +; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 48 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s15, s8, s41 -; SI-NEXT: v_readlane_b32 s8, v62, 58 -; SI-NEXT: s_and_b32 s16, s9, 0xffff -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v26, v24 +; SI-NEXT: s_or_b32 s40, s8, s62 +; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 49 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v24, v19 ; SI-NEXT: v_or_b32_e32 v19, v58, v3 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_or_b32 s36, s16, s56 -; SI-NEXT: s_and_b32 s16, s48, 0xffff +; SI-NEXT: s_or_b32 s15, s8, s41 +; SI-NEXT: s_and_b32 s8, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v3, s14, v19 ; SI-NEXT: s_or_b32 s14, s8, s43 ; SI-NEXT: s_and_b32 s8, s13, 0xffff -; SI-NEXT: 
s_or_b32 s53, s16, vcc_lo -; SI-NEXT: s_and_b32 s16, s72, 0xffff ; SI-NEXT: s_or_b32 s13, s8, s44 ; SI-NEXT: s_and_b32 s8, s12, 0xffff -; SI-NEXT: s_or_b32 s94, s16, vcc_hi -; SI-NEXT: s_and_b32 s16, s57, 0xffff ; SI-NEXT: s_or_b32 s12, s8, s45 ; SI-NEXT: s_and_b32 s8, s10, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 50 +; SI-NEXT: s_or_b32 s10, s8, s46 +; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 51 +; SI-NEXT: s_or_b32 s31, s16, s56 +; SI-NEXT: s_and_b32 s16, s6, 0xffff +; SI-NEXT: s_or_b32 s52, s16, vcc_lo +; SI-NEXT: s_and_b32 s16, s72, 0xffff +; SI-NEXT: s_or_b32 s51, s16, vcc_hi +; SI-NEXT: s_and_b32 s16, s57, 0xffff ; SI-NEXT: s_or_b32 s49, s16, s74 ; SI-NEXT: s_and_b32 s16, s58, 0xffff -; SI-NEXT: s_or_b32 s10, s8, s46 -; SI-NEXT: v_readlane_b32 s8, v62, 59 -; SI-NEXT: s_or_b32 s48, s16, s75 +; SI-NEXT: s_or_b32 s9, s16, s75 ; SI-NEXT: s_and_b32 s16, s59, 0xffff -; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_mov_b32 s25, s11 ; SI-NEXT: s_or_b32 s11, s16, s76 ; SI-NEXT: s_and_b32 s16, s60, 0xffff ; SI-NEXT: s_and_b32 s23, s61, 0xffff -; SI-NEXT: s_mov_b32 s30, s87 -; SI-NEXT: s_mov_b32 s87, s85 +; SI-NEXT: s_mov_b32 s5, s28 +; SI-NEXT: s_mov_b32 s28, s18 ; SI-NEXT: s_or_b32 s8, s8, s47 -; SI-NEXT: s_or_b32 s9, s16, s77 -; SI-NEXT: s_or_b32 s16, s23, s78 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v30, v37 +; SI-NEXT: s_or_b32 s16, s16, s77 +; SI-NEXT: s_or_b32 s24, s23, s78 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v20, v44 ; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v20, v47 -; SI-NEXT: v_mov_b32_e32 v49, v56 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v57 -; SI-NEXT: v_mov_b32_e32 v25, v58 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v30, v57 +; SI-NEXT: v_mov_b32_e32 v49, v58 ; SI-NEXT: 
v_alignbit_b32 v57, s42, v51, 16 ; SI-NEXT: v_alignbit_b32 v58, s40, v52, 16 ; SI-NEXT: v_alignbit_b32 v56, s15, v53, 16 ; SI-NEXT: v_alignbit_b32 v47, s14, v54, 16 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_alignbit_b32 v46, s13, v55, 16 ; SI-NEXT: v_alignbit_b32 v45, s12, v40, 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v44, s10, v41, 16 ; SI-NEXT: v_alignbit_b32 v43, s8, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, s36, v28, 16 -; SI-NEXT: v_alignbit_b32 v41, s53, v60, 16 -; SI-NEXT: v_alignbit_b32 v40, s94, v31, 16 +; SI-NEXT: v_alignbit_b32 v42, s31, v59, 16 +; SI-NEXT: v_alignbit_b32 v41, s52, v60, 16 +; SI-NEXT: v_alignbit_b32 v40, s51, v31, 16 ; SI-NEXT: v_alignbit_b32 v55, s49, v61, 16 -; SI-NEXT: v_alignbit_b32 v54, s48, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, s9, v6, 16 ; SI-NEXT: v_alignbit_b32 v53, s11, v14, 16 ; SI-NEXT: v_mov_b32_e32 v14, v22 -; SI-NEXT: v_alignbit_b32 v52, s9, v18, 16 +; SI-NEXT: v_alignbit_b32 v52, s16, v18, 16 ; SI-NEXT: v_mov_b32_e32 v18, v23 -; SI-NEXT: v_alignbit_b32 v51, s16, v19, 16 +; SI-NEXT: v_alignbit_b32 v51, s24, v19, 16 ; SI-NEXT: v_mov_b32_e32 v19, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 ; SI-NEXT: s_lshr_b32 s73, s63, 16 -; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s72, s62, 16 ; SI-NEXT: s_lshr_b32 s63, s41, 16 ; SI-NEXT: s_lshr_b32 s62, s43, 16 ; SI-NEXT: s_lshr_b32 s61, s44, 16 @@ -198593,191 +199285,202 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s56, vcc_lo, 16 ; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16 ; SI-NEXT: s_lshr_b32 s46, s74, 16 -; SI-NEXT: v_readlane_b32 s25, v62, 62 -; SI-NEXT: v_readlane_b32 s74, v62, 61 +; SI-NEXT: v_readlane_b32 s74, v62, 52 ; SI-NEXT: s_lshr_b32 s45, s75, 16 +; SI-NEXT: v_readlane_b32 s23, v62, 55 +; SI-NEXT: v_readlane_b32 s6, v62, 54 +; SI-NEXT: v_readlane_b32 s75, v62, 53 ; SI-NEXT: s_lshr_b32 s44, s76, 16 ; SI-NEXT: s_mov_b32 s76, s37 -; SI-NEXT: s_mov_b32 s37, s97 -; SI-NEXT: s_mov_b32 s97, s21 
-; SI-NEXT: s_mov_b32 s21, s18 -; SI-NEXT: s_mov_b32 s18, s17 -; SI-NEXT: s_mov_b32 s85, s87 -; SI-NEXT: s_mov_b32 s87, s30 -; SI-NEXT: s_mov_b32 s17, s24 +; SI-NEXT: s_mov_b32 s37, s68 +; SI-NEXT: s_mov_b32 s68, s83 +; SI-NEXT: s_mov_b32 s83, s86 +; SI-NEXT: s_mov_b32 s86, s88 +; SI-NEXT: s_mov_b32 s88, s90 +; SI-NEXT: s_mov_b32 s90, s65 +; SI-NEXT: v_readlane_b32 s65, v62, 56 ; SI-NEXT: s_lshr_b32 s43, s77, 16 -; SI-NEXT: s_mov_b32 s77, s35 -; SI-NEXT: s_mov_b32 s35, s64 -; SI-NEXT: s_mov_b32 s64, s55 -; SI-NEXT: s_mov_b32 s55, s83 -; SI-NEXT: s_mov_b32 s83, s99 -; SI-NEXT: s_mov_b32 s99, s89 -; SI-NEXT: s_mov_b32 s89, s39 -; SI-NEXT: s_mov_b32 s39, s95 -; SI-NEXT: s_mov_b32 s95, s6 +; SI-NEXT: s_mov_b32 s77, s69 +; SI-NEXT: s_mov_b32 s69, s66 +; SI-NEXT: s_mov_b32 s66, s84 +; SI-NEXT: s_mov_b32 s84, s97 +; SI-NEXT: s_mov_b32 s97, s89 +; SI-NEXT: s_mov_b32 s89, s36 +; SI-NEXT: s_mov_b32 s18, s28 +; SI-NEXT: s_mov_b32 s28, s5 ; SI-NEXT: s_lshr_b32 s41, s78, 16 -; SI-NEXT: s_mov_b32 s24, s96 -; SI-NEXT: s_mov_b32 s96, s4 +; SI-NEXT: s_mov_b32 s78, s38 +; SI-NEXT: s_mov_b32 s38, s50 +; SI-NEXT: s_mov_b32 s50, s64 +; SI-NEXT: s_mov_b32 s64, s82 +; SI-NEXT: s_mov_b32 s82, s81 +; SI-NEXT: s_mov_b32 s81, s22 +; SI-NEXT: s_mov_b32 s22, s4 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_mov_b32_e32 v6, v5 ; SI-NEXT: v_mov_b32_e32 v5, v27 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_add_i32 s4, s88, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_add_i32 s4, s7, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 8 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 +; SI-NEXT: s_lshl_b32 s5, 
s5, 8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 10 -; SI-NEXT: v_readlane_b32 s6, v62, 9 +; SI-NEXT: s_add_i32 s4, s82, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_add_i32 s8, s38, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 8 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s50, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s70, s70, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 30 +; SI-NEXT: s_add_i32 s54, s54, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s70, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 +; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_lshl_b32 s8, s91, 8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v39 ; SI-NEXT: s_or_b32 s5, s8, s5 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 ; SI-NEXT: v_or_b32_e32 v2, s5, v2 -; SI-NEXT: s_add_i32 s5, s35, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 13 +; SI-NEXT: v_readlane_b32 s5, v62, 10 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 9 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 8 -; SI-NEXT: s_add_i32 s9, s77, 3 +; SI-NEXT: s_lshl_b32 s8, s27, 8 +; SI-NEXT: s_add_i32 s9, s7, 3 ; SI-NEXT: s_or_b32 
s5, s8, s5 -; SI-NEXT: v_readlane_b32 s6, v62, 12 ; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 24 +; SI-NEXT: s_lshl_b32 s8, s78, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_add_i32 s79, s92, 3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1 +; SI-NEXT: s_add_i32 s24, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_or_b32 s4, s8, s4 ; SI-NEXT: s_add_i32 s16, s4, 0x3000000 +; SI-NEXT: s_add_i32 s4, s21, 3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 -; SI-NEXT: s_add_i32 s9, s5, 0x3000000 -; SI-NEXT: s_and_b32 s4, s79, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s71, 8 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s4, s24, 3 +; SI-NEXT: s_add_i32 s4, s66, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s55, 8 +; SI-NEXT: s_lshl_b32 s5, s69, 8 ; SI-NEXT: s_add_i32 s8, s37, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s51, 24 +; SI-NEXT: s_lshl_b32 s5, s48, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v5, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; 
SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s52, s98, 3 +; SI-NEXT: s_add_i32 s70, s70, 3 ; SI-NEXT: s_add_i32 s11, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_and_b32 s4, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s85, 8 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v20, v4 -; SI-NEXT: s_add_i32 s30, s87, 3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: s_add_i32 s37, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 12 ; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s30, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_add_i32 s8, s68, 3 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_add_i32 s8, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 11 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s28, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s48, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 42 -; SI-NEXT: v_mov_b32_e32 v22, v30 -; SI-NEXT: s_add_i32 s87, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 39 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 +; SI-NEXT: s_add_i32 s85, s35, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v37 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 ; SI-NEXT: 
v_or_b32_e32 v7, s4, v7 ; SI-NEXT: v_readlane_b32 s4, v62, 32 -; SI-NEXT: s_add_i32 s67, s4, 3 -; SI-NEXT: s_and_b32 s4, s67, 0xff -; SI-NEXT: s_lshl_b32 s5, s85, 8 -; SI-NEXT: s_add_i32 s8, s69, 3 +; SI-NEXT: s_add_i32 s64, s4, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 14 +; SI-NEXT: s_and_b32 s4, s64, 0xff +; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_add_i32 s8, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 13 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s74, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s50, s90, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 23 ; SI-NEXT: s_add_i32 s49, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 27 +; SI-NEXT: v_mov_b32_e32 v22, v28 +; SI-NEXT: s_add_i32 s50, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 24 ; SI-NEXT: s_and_b32 s4, s50, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s94, s86, 3 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: s_add_i32 s92, s18, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 15 ; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s38, 8 -; SI-NEXT: s_add_i32 s8, s71, 3 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_add_i32 s8, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; 
SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -198786,77 +199489,112 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s94, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 52 +; SI-NEXT: s_add_i32 s51, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 43 ; SI-NEXT: s_add_i32 s18, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 51 +; SI-NEXT: v_readlane_b32 s5, v62, 42 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 17 ; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: v_readlane_b32 s4, v62, 45 -; SI-NEXT: s_add_i32 s98, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 43 -; SI-NEXT: v_readlane_b32 s6, v62, 14 ; SI-NEXT: s_and_b32 s4, s98, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_lshl_b32 s5, s87, 8 ; SI-NEXT: s_add_i32 s8, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 16 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s31, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s53, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 41 -; SI-NEXT: s_add_i32 s86, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 38 -; SI-NEXT: s_and_b32 s4, s86, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s87, s53, 3 +; SI-NEXT: s_add_i32 s52, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s87, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; 
SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v10, v59, v10 +; SI-NEXT: v_readlane_b32 s5, v62, 29 +; SI-NEXT: v_readlane_b32 s6, v62, 25 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s48, s6, 3 +; SI-NEXT: s_and_b32 s8, s48, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s86, s39, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 36 +; SI-NEXT: s_add_i32 s69, s6, 3 +; SI-NEXT: s_and_b32 s7, s69, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s35, s23, 3 +; SI-NEXT: s_and_b32 s6, s35, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s99, s99, 3 +; SI-NEXT: s_add_i32 s81, s90, 3 +; SI-NEXT: s_add_i32 s71, s79, 3 +; SI-NEXT: s_add_i32 s38, s75, 3 +; SI-NEXT: s_add_i32 s93, s97, 3 +; SI-NEXT: s_add_i32 s91, s96, 3 +; SI-NEXT: v_mov_b32_e32 v30, s24 +; SI-NEXT: v_mov_b32_e32 v39, s16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_mov_b32_e32 v28, s11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: v_mov_b32_e32 v26, s49 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: v_mov_b32_e32 v25, s51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: v_mov_b32_e32 v24, s52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: v_readlane_b32 s4, v62, 31 +; SI-NEXT: v_readlane_b32 s4, v62, 34 ; SI-NEXT: s_add_i32 s66, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 27 ; SI-NEXT: s_and_b32 s4, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s37, s39, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s37, 0xff +; SI-NEXT: v_readlane_b32 s5, v62, 26 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s95, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: 
s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s36, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 50 +; SI-NEXT: s_add_i32 s31, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 41 ; SI-NEXT: s_add_i32 s21, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 49 +; SI-NEXT: v_readlane_b32 s5, v62, 39 ; SI-NEXT: s_and_b32 s4, s21, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 37 -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 +; SI-NEXT: v_mov_b32_e32 v23, s31 +; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 +; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 +; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 +; SI-NEXT: s_lshr_b32 s57, s31, 16 +; SI-NEXT: s_lshr_b32 s56, s52, 16 +; SI-NEXT: s_lshr_b32 s47, s51, 16 +; SI-NEXT: s_lshr_b32 s46, s49, 16 +; SI-NEXT: s_lshr_b32 s45, s9, 16 +; SI-NEXT: s_lshr_b32 s44, s11, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: s_lshr_b32 s41, s24, 16 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s71, s22, 3 -; SI-NEXT: s_and_b32 s8, s71, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s35, s99, 3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload @@ -198864,63 +199602,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v32, v11 ; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; 
SI-NEXT: v_readlane_b32 s4, v62, 40 -; SI-NEXT: s_add_i32 s85, s4, 3 -; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_and_b32 s4, s86, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 33 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s22, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s8, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 54 +; SI-NEXT: v_readlane_b32 s4, v62, 45 ; SI-NEXT: s_add_i32 s17, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 53 +; SI-NEXT: v_readlane_b32 s5, v62, 44 ; SI-NEXT: s_and_b32 s4, s17, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 47 +; SI-NEXT: v_readlane_b32 s5, v62, 38 ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_and_b32 s6, s35, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_mov_b32_e32 v30, s16 -; SI-NEXT: v_mov_b32_e32 v39, s9 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; SI-NEXT: v_mov_b32_e32 v28, s11 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 -; SI-NEXT: v_mov_b32_e32 v27, s48 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; SI-NEXT: v_mov_b32_e32 v26, s49 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 -; SI-NEXT: v_mov_b32_e32 v25, s94 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; SI-NEXT: v_mov_b32_e32 v24, s53 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 -; SI-NEXT: v_mov_b32_e32 v23, s36 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11 ; SI-NEXT: v_mov_b32_e32 v22, s8 ; SI-NEXT: v_alignbit_b32 v43, v22, v11, 16 -; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 -; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 -; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 -; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 -; SI-NEXT: 
v_alignbit_b32 v52, v39, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 ; SI-NEXT: s_lshr_b32 s58, s8, 16 -; SI-NEXT: s_lshr_b32 s57, s36, 16 -; SI-NEXT: s_lshr_b32 s56, s53, 16 -; SI-NEXT: s_lshr_b32 s47, s94, 16 -; SI-NEXT: s_lshr_b32 s46, s49, 16 -; SI-NEXT: s_lshr_b32 s45, s48, 16 -; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: s_lshr_b32 s43, s9, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -198931,19 +199634,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v6 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s4, v62, 48 -; SI-NEXT: s_add_i32 s7, s4, 3 -; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 40 +; SI-NEXT: s_add_i32 s22, s4, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 23 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s89, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 37 ; SI-NEXT: s_add_i32 s10, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 46 -; SI-NEXT: s_add_i32 s99, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 44 ; SI-NEXT: s_and_b32 s4, s99, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -198952,29 +199654,24 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_readlane_b32 s6, v62, 33 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 36 -; SI-NEXT: s_add_i32 s81, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 35 -; SI-NEXT: v_readlane_b32 s6, v62, 28 ; 
SI-NEXT: s_and_b32 s4, s81, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s55, s6, 3 +; SI-NEXT: s_lshl_b32 s5, s88, 8 +; SI-NEXT: s_add_i32 s65, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 26 -; SI-NEXT: s_and_b32 s6, s55, 0xff +; SI-NEXT: v_readlane_b32 s5, v62, 31 +; SI-NEXT: s_and_b32 s6, s65, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s12, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 34 -; SI-NEXT: s_add_i32 s69, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 29 +; SI-NEXT: v_readlane_b32 s5, v62, 35 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 -; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_add_i32 s12, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s71, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -198983,28 +199680,24 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_readlane_b32 s5, v62, 22 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 22 -; SI-NEXT: s_add_i32 s34, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 21 -; SI-NEXT: v_readlane_b32 s6, v62, 19 -; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_and_b32 s4, s38, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s92, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s92, 0xff +; SI-NEXT: s_and_b32 s6, s93, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s97, 24 +; SI-NEXT: s_lshl_b32 s5, s89, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s13, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 25 -; 
SI-NEXT: s_add_i32 s51, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 30 +; SI-NEXT: s_add_i32 s53, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 28 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 -; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_and_b32 s4, s53, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -199014,24 +199707,23 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v14, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 20 -; SI-NEXT: s_add_i32 s95, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 18 -; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 21 +; SI-NEXT: s_add_i32 s30, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 20 +; SI-NEXT: s_and_b32 s4, s30, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s6, s96, 3 +; SI-NEXT: s_add_i32 s88, s34, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_and_b32 s6, s88, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s80, 24 +; SI-NEXT: s_lshl_b32 s5, s83, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s14, s4, 0x3000000 -; SI-NEXT: s_add_i32 s4, s93, 3 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s91, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -199040,30 +199732,30 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_add_i32 s4, s83, 3 +; SI-NEXT: s_add_i32 s4, s84, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 
s5, s25, 8 -; SI-NEXT: s_add_i32 s6, s64, 3 +; SI-NEXT: s_lshl_b32 s5, s80, 8 +; SI-NEXT: s_add_i32 s6, s77, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 15 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s68, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s15, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: v_readlane_b32 s4, v62, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: v_readlane_b32 s5, v62, 2 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s26, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 1 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -199071,14 +199763,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s17, s4, 0x3000000 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 18 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s6, s76, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s67, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -199086,11 +199778,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s40, s4, 0x3000000 ; SI-NEXT: v_readlane_b32 s4, v62, 7 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: 
v_readlane_b32 s5, v62, 17 -; SI-NEXT: v_readlane_b32 s6, v62, 6 +; SI-NEXT: v_readlane_b32 s5, v62, 19 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s6, s76, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -199099,15 +199790,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s18, s4, 0x3000000 +; SI-NEXT: s_add_i32 s25, s4, 0x3000000 ; SI-NEXT: s_add_i32 s4, s20, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 5 -; SI-NEXT: v_readlane_b32 s6, v62, 4 +; SI-NEXT: v_readlane_b32 s5, v62, 6 +; SI-NEXT: v_readlane_b32 s6, v62, 5 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 4 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 @@ -199116,7 +199807,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s42, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v13, s25 ; SI-NEXT: v_mov_b32_e32 v20, s10 ; SI-NEXT: v_mov_b32_e32 v19, s12 ; SI-NEXT: v_mov_b32_e32 v18, s13 @@ -199129,10 +199820,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v58, s40, v13, 16 ; SI-NEXT: v_alignbit_b32 v56, v6, v50, 16 ; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16 ; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 ; SI-NEXT: s_lshr_b32 s73, s42, 16 ; SI-NEXT: s_lshr_b32 s72, s40, 16 @@ 
-199142,7 +199831,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s60, s12, 16 ; SI-NEXT: s_lshr_b32 s59, s10, 16 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_and_b32 s4, s25, 0xffff ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: s_and_b32 s4, s42, 0xffff @@ -199247,7 +199936,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 -; SI-NEXT: s_and_b32 s4, s36, 0xffff +; SI-NEXT: s_and_b32 s4, s31, 0xffff ; SI-NEXT: s_lshl_b32 s5, s57, 16 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 @@ -199260,7 +199949,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 -; SI-NEXT: s_and_b32 s4, s53, 0xffff +; SI-NEXT: s_and_b32 s4, s52, 0xffff ; SI-NEXT: s_lshl_b32 s5, s56, 16 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 @@ -199273,7 +199962,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 -; SI-NEXT: s_and_b32 s4, s94, 0xffff +; SI-NEXT: s_and_b32 s4, s51, 0xffff ; SI-NEXT: s_lshl_b32 s5, s47, 16 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 @@ -199298,7 +199987,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; SI-NEXT: s_and_b32 s4, s48, 0xffff +; SI-NEXT: s_and_b32 s4, s9, 0xffff ; SI-NEXT: s_lshl_b32 s5, s45, 16 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; 
SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 @@ -199322,7 +200011,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 @@ -199335,7 +200024,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s41, 16 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 @@ -199396,8 +200085,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -199405,18 +200094,17 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v5, v13 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v58 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v57 -; SI-NEXT: v_mov_b32_e32 v49, 
v56 -; SI-NEXT: v_mov_b32_e32 v20, v47 -; SI-NEXT: v_mov_b32_e32 v30, v37 -; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: v_mov_b32_e32 v30, v57 +; SI-NEXT: v_mov_b32_e32 v48, v47 ; SI-NEXT: v_mov_b32_e32 v35, v45 +; SI-NEXT: v_mov_b32_e32 v20, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v36 ; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr73 @@ -199450,15 +200138,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -199466,7 +200154,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -199474,11 +200162,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: 
$vgpr52 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: s_branch .LBB97_2 ; @@ -201650,29 +202338,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, 
s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:132 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:36 @@ -201704,16 +202392,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v162, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v148, 8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v176, 8, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v178, 8, v31 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 @@ -201769,7 +202457,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s5 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 @@ -201786,6 +202474,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff @@ -201794,115 +202485,139 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v133 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v66 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 
16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v52 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v162 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v148 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, 
v163 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45 @@ -201910,40 +202625,40 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74 @@ -202084,112 +202799,112 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v177 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v161 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v75, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v147 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(13) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v150 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v149 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v145 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v178, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v162, v3 ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v148, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v133, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v131, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 @@ -202203,7 +202918,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v129, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 @@ -202212,7 +202927,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v135, v1 ; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6 @@ -202310,32 +203025,32 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v145, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v118, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v128, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v134, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v161, 16, v32 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 @@ -202474,31 +203189,31 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 
offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:132 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 @@ -202530,16 +203245,16 @@ define inreg <64 x i16> 
@bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 @@ -202595,7 +203310,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 @@ -202612,6 +203327,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-FAKE16-NEXT: 
s_and_b32 s9, s22, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff @@ -202620,115 +203338,139 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v133 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v65 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v66 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v81 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, 
v54 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v102 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v112 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v145 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 @@ -202736,40 +203478,40 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 @@ -202777,7 +203519,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, 
i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 @@ -202905,117 +203647,117 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v179 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v177 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v161 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v161, 
0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v147 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v132 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 ; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v118 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v150 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v149 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v145 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 
v3, 0xff, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v178, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v130 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v128 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v148, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v133, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 @@ -203029,7 +203771,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v129, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 ; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 @@ -203038,7 +203780,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v135, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 @@ -203136,33 +203878,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v145, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v118, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v134, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v161, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 @@ -208002,482 +208744,482 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; 
GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 ; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr148 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v15 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v32 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v17 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] ; GFX11-FAKE16-NEXT: .LBB98_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] ; GFX11-FAKE16-NEXT: 
v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 
8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 24, 
v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-FAKE16-NEXT: .LBB98_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v61 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v73 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v43 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v55, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v65, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v178 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v66, 8, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v53, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v148 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v145 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v118 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v101 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v98 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v87 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v83 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 @@ -208499,30 +209241,30 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v45 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v41 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v181 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v167 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 @@ -208544,29 +209286,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 
v17, 0xff, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v147 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v144 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v131 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v119 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 @@ -208589,31 +209331,31 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v113 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v102 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v100 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v86 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v70 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 
v27, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 @@ -208751,673 +209493,682 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v63, s80, 24 ; SI-NEXT: v_writelane_b32 v63, s81, 25 ; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v62, s18, 0 ; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: s_mov_b32 s6, s22 +; SI-NEXT: v_writelane_b32 v62, s17, 1 ; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v62, s6, 2 ; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v62, s16, 3 ; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: s_mov_b32 s7, s26 +; SI-NEXT: v_writelane_b32 v62, s21, 4 ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: s_mov_b32 s8, s20 +; SI-NEXT: v_writelane_b32 v62, s7, 5 ; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v62, s8, 6 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: s_mov_b32 s6, s18 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s9, s24 +; SI-NEXT: v_writelane_b32 v62, s25, 7 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s62, v30 -; SI-NEXT: v_readfirstlane_b32 s63, v29 -; SI-NEXT: v_readfirstlane_b32 s59, v26 -; SI-NEXT: v_readfirstlane_b32 s60, v25 -; SI-NEXT: v_readfirstlane_b32 s98, v22 -; SI-NEXT: v_readfirstlane_b32 s61, v21 -; SI-NEXT: v_readfirstlane_b32 s99, v18 -; SI-NEXT: v_readfirstlane_b32 s58, v17 -; SI-NEXT: v_readfirstlane_b32 s96, v14 -; SI-NEXT: v_readfirstlane_b32 s97, v13 -; SI-NEXT: v_readfirstlane_b32 s86, v10 -; SI-NEXT: v_readfirstlane_b32 s87, v9 -; SI-NEXT: v_readfirstlane_b32 s84, v6 -; SI-NEXT: v_readfirstlane_b32 s85, v5 -; SI-NEXT: v_readfirstlane_b32 s81, v2 -; SI-NEXT: v_readfirstlane_b32 s82, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_readfirstlane_b32 s98, v30 +; SI-NEXT: v_readfirstlane_b32 s99, v29 +; SI-NEXT: v_readfirstlane_b32 s84, v26 +; SI-NEXT: v_readfirstlane_b32 s85, v25 +; SI-NEXT: v_readfirstlane_b32 s81, v22 +; SI-NEXT: v_readfirstlane_b32 s97, v21 +; SI-NEXT: v_readfirstlane_b32 s82, v18 +; SI-NEXT: v_readfirstlane_b32 s83, v17 +; SI-NEXT: v_readfirstlane_b32 s68, v14 +; SI-NEXT: v_readfirstlane_b32 s70, v13 +; SI-NEXT: v_readfirstlane_b32 s66, v10 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s88, v36 -; SI-NEXT: v_readfirstlane_b32 s18, v37 -; SI-NEXT: v_readfirstlane_b32 s78, v38 -; SI-NEXT: v_readfirstlane_b32 s79, v39 -; SI-NEXT: v_readfirstlane_b32 s76, v48 -; SI-NEXT: v_readfirstlane_b32 s77, v49 -; SI-NEXT: v_readfirstlane_b32 s74, v50 +; SI-NEXT: v_readfirstlane_b32 s74, v36 +; SI-NEXT: v_readfirstlane_b32 s24, v37 +; SI-NEXT: v_readfirstlane_b32 s72, v38 +; SI-NEXT: v_readfirstlane_b32 s20, v39 +; SI-NEXT: v_readfirstlane_b32 s63, v48 +; SI-NEXT: v_readfirstlane_b32 s26, v49 +; SI-NEXT: v_readfirstlane_b32 s73, v50 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s75, v51 +; SI-NEXT: v_readfirstlane_b32 s62, v51 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s72, v52 +; SI-NEXT: v_readfirstlane_b32 s61, v52 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s73, v53 +; SI-NEXT: v_readfirstlane_b32 s22, v53 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; SI-NEXT: v_readfirstlane_b32 s67, v9 +; SI-NEXT: v_readfirstlane_b32 s53, v6 +; SI-NEXT: v_readfirstlane_b32 s54, v5 +; SI-NEXT: v_readfirstlane_b32 s50, v2 +; SI-NEXT: v_readfirstlane_b32 s52, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 -; SI-NEXT: 
v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v41 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v42 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v43 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_writelane_b32 v62, s9, 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s56, s4, s5 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: s_or_b32 s57, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s56 -; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s58, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s57 +; SI-NEXT: s_and_b32 s4, s8, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 -; 
SI-NEXT: v_alignbit_b32 v8, s57, v1, 24 -; SI-NEXT: v_alignbit_b32 v50, s57, v1, 16 -; SI-NEXT: v_alignbit_b32 v1, s57, v1, 8 -; SI-NEXT: s_or_b32 s46, s4, s5 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, s58, v1, 24 ; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s46 -; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_alignbit_b32 v9, s58, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s58, v1, 8 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s9, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s47, v1, 24 -; SI-NEXT: s_or_b32 s44, s4, s5 -; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_mov_b32_e32 v1, s47 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s7, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s47, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, s47, v1, 8 -; SI-NEXT: s_or_b32 s45, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: v_alignbit_b32 v9, s46, v1, 24 +; SI-NEXT: s_or_b32 s44, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s45, v1, 24 -; SI-NEXT: s_or_b32 s42, s4, s5 -; SI-NEXT: 
s_and_b32 s4, s82, 0xffff -; SI-NEXT: s_lshl_b32 s5, s81, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s45, v1, 16 -; SI-NEXT: v_alignbit_b32 v49, s45, v1, 8 +; SI-NEXT: v_alignbit_b32 v9, s46, v1, 16 +; SI-NEXT: v_alignbit_b32 v49, s46, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s45 ; SI-NEXT: s_or_b32 s43, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s42 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_lshl_b32 s5, s50, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s43, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, s44, v1, 24 +; SI-NEXT: v_alignbit_b32 v48, s44, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, s44, v1, 8 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s43 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v9, s42, v1, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s43, v1, 16 -; SI-NEXT: v_alignbit_b32 v48, s43, v1, 8 +; SI-NEXT: v_alignbit_b32 v9, s42, v1, 16 +; SI-NEXT: v_alignbit_b32 v50, s42, v1, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s85, 0xffff -; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_and_b32 s4, s54, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 ; SI-NEXT: v_or_b32_e32 v16, v1, v2 ; SI-NEXT: s_or_b32 s41, s4, s5 ; SI-NEXT: v_alignbit_b32 v1, s41, v16, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_alignbit_b32 v1, s41, v16, 16 -; SI-NEXT: s_and_b32 s4, s87, 0xffff -; SI-NEXT: s_lshl_b32 s5, s86, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s41, v16, 8 -; SI-NEXT: s_or_b32 s40, s4, s5 -; SI-NEXT: s_and_b32 s4, s97, 0xffff -; SI-NEXT: s_lshl_b32 s5, s96, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: s_or_b32 s15, s4, s5 -; SI-NEXT: s_and_b32 s4, s58, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 16 -; SI-NEXT: v_or_b32_e32 v14, v1, v4 -; SI-NEXT: s_or_b32 s14, s4, s5 -; SI-NEXT: s_and_b32 s4, s61, 0xffff -; SI-NEXT: s_lshl_b32 s5, s98, 16 -; SI-NEXT: v_alignbit_b32 v1, s40, v14, 24 -; SI-NEXT: s_or_b32 s13, s4, s5 -; SI-NEXT: s_and_b32 s4, s60, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_and_b32 s4, s67, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 +; SI-NEXT: v_or_b32_e32 v13, v1, v4 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: v_alignbit_b32 v1, s40, v13, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s40, v14, 16 -; SI-NEXT: s_or_b32 s12, s4, s5 -; SI-NEXT: s_and_b32 s4, s63, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: v_alignbit_b32 v1, s40, v13, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s40, v14, 8 -; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: s_and_b32 s4, s73, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: v_alignbit_b32 v1, s40, v13, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: s_and_b32 s4, s75, 0xffff -; SI-NEXT: s_lshl_b32 s5, s74, 16 
+; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 ; SI-NEXT: v_or_b32_e32 v12, v1, v5 -; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s4, s77, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 ; SI-NEXT: v_alignbit_b32 v1, s15, v12, 24 -; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: s_and_b32 s4, s79, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_and_b32 s4, s83, 0xffff +; SI-NEXT: s_lshl_b32 s5, s82, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, s15, v12, 16 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, s15, v12, 8 -; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_lshr_b32 s4, s11, 8 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_writelane_b32 v62, s4, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 8 -; SI-NEXT: v_or_b32_e32 v10, v1, v6 -; SI-NEXT: v_writelane_b32 v62, s4, 3 -; SI-NEXT: s_lshr_b32 s4, s9, 8 -; SI-NEXT: v_alignbit_b32 v1, s14, v10, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s99, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v1, v6 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: 
v_alignbit_b32 v1, s14, v9, 24 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s62, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s14, v10, 16 -; SI-NEXT: v_writelane_b32 v62, s4, 9 -; SI-NEXT: s_lshr_b32 s4, s7, 8 +; SI-NEXT: v_alignbit_b32 v1, s14, v9, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s14, v10, 8 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: s_lshr_b32 s4, s6, 8 +; SI-NEXT: v_alignbit_b32 v1, s14, v9, 8 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_writelane_b32 v62, s4, 15 -; SI-NEXT: s_and_b32 s4, s72, 0xffff -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v1, v9 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: v_or_b32_e32 v6, v1, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: s_and_b32 s4, s74, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v1, v13 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshr_b32 s4, s9, 8 +; SI-NEXT: v_or_b32_e32 v5, v1, v10 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_writelane_b32 v62, s4, 5 -; SI-NEXT: s_and_b32 s4, s76, 0xffff -; SI-NEXT: v_mov_b32_e32 v28, v13 -; SI-NEXT: v_or_b32_e32 v13, v1, v17 +; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: v_or_b32_e32 v2, v1, v14 ; SI-NEXT: s_waitcnt 
vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: s_and_b32 s4, s78, 0xffff -; SI-NEXT: v_mov_b32_e32 v26, v9 -; SI-NEXT: v_or_b32_e32 v9, v1, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: v_writelane_b32 v62, s4, 11 -; SI-NEXT: s_and_b32 s4, s88, 0xffff -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: v_or_b32_e32 v6, v1, v20 +; SI-NEXT: s_lshr_b32 s4, s7, 8 +; SI-NEXT: v_mov_b32_e32 v25, v14 +; SI-NEXT: v_or_b32_e32 v14, v1, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: s_lshr_b32 s4, s6, 8 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_or_b32_e32 v10, v1, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: s_bfe_u32 s4, s74, 0x80008 -; SI-NEXT: v_or_b32_e32 v4, v1, v21 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: s_bfe_u32 s4, s76, 0x80008 -; SI-NEXT: v_or_b32_e32 v2, v1, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_writelane_b32 v62, s4, 7 -; SI-NEXT: s_bfe_u32 s4, s78, 0x80008 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 ; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: s_bfe_u32 s4, s88, 0x80008 -; SI-NEXT: v_mov_b32_e32 v29, v17 -; SI-NEXT: v_mov_b32_e32 v30, v18 +; SI-NEXT: s_and_b32 s4, s72, 0xffff +; SI-NEXT: v_or_b32_e32 v4, v1, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: s_and_b32 s4, s74, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: s_bfe_u32 s4, s74, 0x80008 +; SI-NEXT: v_alignbit_b32 v39, s41, v16, 8 +; SI-NEXT: v_mov_b32_e32 v28, v17 +; SI-NEXT: v_mov_b32_e32 v26, v18 ; SI-NEXT: v_mov_b32_e32 v36, v20 ; SI-NEXT: v_mov_b32_e32 v37, v21 ; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: 
v_mov_b32_e32 v39, v24 -; SI-NEXT: s_lshr_b32 s68, s57, 8 -; SI-NEXT: s_lshr_b32 s65, s47, 8 -; SI-NEXT: s_lshr_b32 s54, s45, 8 -; SI-NEXT: s_lshr_b32 s51, s43, 8 -; SI-NEXT: s_lshr_b32 s48, s41, 8 -; SI-NEXT: s_lshr_b32 s37, s40, 8 -; SI-NEXT: s_lshr_b32 s34, s15, 8 -; SI-NEXT: s_lshr_b32 s95, s14, 8 -; SI-NEXT: s_lshr_b32 s92, s13, 8 +; SI-NEXT: s_lshr_b32 s60, s58, 8 +; SI-NEXT: s_lshr_b32 s96, s46, 8 +; SI-NEXT: s_lshr_b32 s80, s44, 8 +; SI-NEXT: s_lshr_b32 s65, s42, 8 +; SI-NEXT: s_lshr_b32 s51, s41, 8 +; SI-NEXT: s_lshr_b32 s39, s40, 8 +; SI-NEXT: s_lshr_b32 s36, s15, 8 +; SI-NEXT: s_lshr_b32 s31, s14, 8 +; SI-NEXT: s_lshr_b32 s93, s13, 8 ; SI-NEXT: s_lshr_b32 s89, s12, 8 -; SI-NEXT: s_and_b32 s71, s19, 0xffff -; SI-NEXT: s_and_b32 s69, s23, 0xffff -; SI-NEXT: s_and_b32 s66, s27, 0xffff -; SI-NEXT: s_and_b32 s55, s81, 0xffff -; SI-NEXT: s_and_b32 s52, s84, 0xffff -; SI-NEXT: s_and_b32 s49, s86, 0xffff -; SI-NEXT: s_and_b32 s38, s96, 0xffff -; SI-NEXT: s_and_b32 s35, s99, 0xffff -; SI-NEXT: s_and_b32 s30, s98, 0xffff -; SI-NEXT: s_and_b32 s93, s59, 0xffff -; SI-NEXT: s_and_b32 s90, s62, 0xffff -; SI-NEXT: s_bfe_u32 s83, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s80, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s70, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s67, s81, 0x80008 -; SI-NEXT: s_bfe_u32 s64, s84, 0x80008 -; SI-NEXT: s_bfe_u32 s53, s86, 0x80008 -; SI-NEXT: s_bfe_u32 s50, s96, 0x80008 -; SI-NEXT: s_bfe_u32 s39, s99, 0x80008 -; SI-NEXT: s_bfe_u32 s36, s98, 0x80008 -; SI-NEXT: s_bfe_u32 s31, s59, 0x80008 -; SI-NEXT: s_bfe_u32 s94, s62, 0x80008 -; SI-NEXT: s_bfe_u32 s91, s72, 0x80008 -; SI-NEXT: v_writelane_b32 v62, s4, 13 -; SI-NEXT: v_alignbit_b32 v45, s13, v8, 24 -; SI-NEXT: v_alignbit_b32 v47, s13, v8, 16 -; SI-NEXT: v_alignbit_b32 v57, s13, v8, 8 -; SI-NEXT: v_alignbit_b32 v41, s12, v5, 24 -; SI-NEXT: v_alignbit_b32 v43, s12, v5, 16 -; SI-NEXT: v_alignbit_b32 v44, s12, v5, 8 -; SI-NEXT: v_alignbit_b32 v21, s11, v13, 24 -; SI-NEXT: v_alignbit_b32 v22, s11, v13, 16 -; SI-NEXT: 
v_alignbit_b32 v24, s11, v13, 8 -; SI-NEXT: v_alignbit_b32 v17, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v18, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v20, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v59, s9, v6, 24 -; SI-NEXT: v_alignbit_b32 v60, s9, v6, 16 -; SI-NEXT: v_alignbit_b32 v61, s9, v6, 8 -; SI-NEXT: v_alignbit_b32 v46, s8, v4, 24 -; SI-NEXT: v_alignbit_b32 v56, s8, v4, 16 -; SI-NEXT: v_alignbit_b32 v58, s8, v4, 8 -; SI-NEXT: v_alignbit_b32 v55, s7, v2, 24 -; SI-NEXT: v_alignbit_b32 v40, s7, v2, 16 -; SI-NEXT: v_alignbit_b32 v42, s7, v2, 8 -; SI-NEXT: v_alignbit_b32 v52, s6, v1, 24 -; SI-NEXT: v_alignbit_b32 v53, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, s6, v1, 8 +; SI-NEXT: s_lshr_b32 s78, s11, 8 +; SI-NEXT: s_lshr_b32 s75, s10, 8 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_and_b32 s56, s23, 0xffff +; SI-NEXT: s_and_b32 s86, s27, 0xffff +; SI-NEXT: s_and_b32 s71, s50, 0xffff +; SI-NEXT: s_and_b32 s64, s53, 0xffff +; SI-NEXT: s_and_b32 s49, s66, 0xffff +; SI-NEXT: s_and_b32 s38, s68, 0xffff +; SI-NEXT: s_and_b32 s35, s82, 0xffff +; SI-NEXT: s_and_b32 s30, s81, 0xffff +; SI-NEXT: s_and_b32 s92, s84, 0xffff +; SI-NEXT: s_and_b32 s90, s98, 0xffff +; SI-NEXT: s_and_b32 s79, s61, 0xffff +; SI-NEXT: s_and_b32 s76, s73, 0xffff +; SI-NEXT: s_bfe_u32 s21, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s17, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s59, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s87, s50, 0x80008 +; SI-NEXT: s_bfe_u32 s69, s53, 0x80008 +; SI-NEXT: s_bfe_u32 s55, s66, 0x80008 +; SI-NEXT: s_bfe_u32 s48, s68, 0x80008 +; SI-NEXT: s_bfe_u32 s37, s82, 0x80008 +; SI-NEXT: s_bfe_u32 s34, s81, 0x80008 +; SI-NEXT: s_bfe_u32 s95, s84, 0x80008 +; SI-NEXT: s_bfe_u32 s94, s98, 0x80008 +; SI-NEXT: s_bfe_u32 s91, s61, 0x80008 +; SI-NEXT: s_bfe_u32 s88, s73, 0x80008 +; SI-NEXT: s_bfe_u32 s77, s63, 0x80008 +; SI-NEXT: s_bfe_u32 s25, s72, 0x80008 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_alignbit_b32 v55, s13, v6, 24 +; SI-NEXT: v_alignbit_b32 v40, s13, v6, 16 +; SI-NEXT: 
v_alignbit_b32 v41, s13, v6, 8 +; SI-NEXT: v_alignbit_b32 v52, s12, v5, 24 +; SI-NEXT: v_alignbit_b32 v53, s12, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, s12, v5, 8 +; SI-NEXT: v_alignbit_b32 v21, s11, v2, 24 +; SI-NEXT: v_alignbit_b32 v22, s11, v2, 16 +; SI-NEXT: v_alignbit_b32 v24, s11, v2, 8 +; SI-NEXT: v_alignbit_b32 v17, s10, v14, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, s10, v14, 8 +; SI-NEXT: v_alignbit_b32 v59, s9, v10, 24 +; SI-NEXT: v_alignbit_b32 v60, s9, v10, 16 +; SI-NEXT: v_alignbit_b32 v61, s9, v10, 8 +; SI-NEXT: v_alignbit_b32 v56, s8, v8, 24 +; SI-NEXT: v_alignbit_b32 v57, s8, v8, 16 +; SI-NEXT: v_alignbit_b32 v58, s8, v8, 8 +; SI-NEXT: v_alignbit_b32 v45, s7, v4, 24 +; SI-NEXT: v_alignbit_b32 v46, s7, v4, 16 +; SI-NEXT: v_alignbit_b32 v47, s7, v4, 8 +; SI-NEXT: v_alignbit_b32 v42, s6, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v44, s6, v1, 8 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s79, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s77, s77, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s7, s4, 0x30000 -; SI-NEXT: 
s_and_b32 s4, s77, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s62, s62, 3 ; SI-NEXT: s_add_i32 s8, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s75, 0xffff -; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: s_and_b32 s4, s62, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s9, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s73, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_add_i32 s10, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s63, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_and_b32 s4, s99, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_add_i32 s85, s85, 3 ; SI-NEXT: s_add_i32 s11, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s60, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_add_i32 s97, s97, 3 ; SI-NEXT: s_add_i32 s12, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s61, 0xffff -; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_add_i32 s83, s83, 3 ; SI-NEXT: s_add_i32 s13, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s58, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_and_b32 s4, s83, 0xffff +; SI-NEXT: s_lshl_b32 s5, s82, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s70, s70, 3 ; SI-NEXT: s_add_i32 s14, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s97, 
0xffff -; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s67, s67, 3 ; SI-NEXT: s_add_i32 s15, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s87, 0xffff -; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_and_b32 s4, s67, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_add_i32 s54, s54, 3 ; SI-NEXT: s_add_i32 s40, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s85, 0xffff -; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_and_b32 s4, s54, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s41, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s82, s82, 3 -; SI-NEXT: s_add_i32 s42, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s82, 0xffff -; SI-NEXT: s_lshl_b32 s5, s81, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s52, s52, 3 ; SI-NEXT: s_add_i32 s43, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_lshl_b32 s5, s50, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s44, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s45, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_readlane_b32 s4, v62, 5 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s46, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: v_readlane_b32 s4, v62, 6 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s47, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: s_add_i32 s4, s4, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s57, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v21, s56 -; SI-NEXT: v_alignbit_b32 v22, s57, v21, 24 -; SI-NEXT: v_alignbit_b32 v50, s57, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s57, v21, 8 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s58, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v21, s57 +; SI-NEXT: s_waitcnt 
expcnt(3) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_alignbit_b32 v22, s58, v21, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v21, s46 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, s58, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s58, v21, 8 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s47, v21, 24 -; SI-NEXT: s_lshr_b32 s4, s11, 8 +; SI-NEXT: v_mov_b32_e32 v21, s47 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s47, v21, 16 -; SI-NEXT: v_alignbit_b32 v51, s47, v21, 8 -; SI-NEXT: v_mov_b32_e32 v21, s44 -; SI-NEXT: v_writelane_b32 v62, s4, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: s_lshr_b32 s4, s10, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_alignbit_b32 v22, s46, v21, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16 -; SI-NEXT: v_alignbit_b32 v49, s45, v21, 8 -; SI-NEXT: v_mov_b32_e32 v21, s42 -; SI-NEXT: v_writelane_b32 v62, s4, 3 -; SI-NEXT: s_lshr_b32 s4, s9, 24 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_alignbit_b32 v22, s46, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, s46, v21, 8 +; SI-NEXT: v_mov_b32_e32 v21, s45 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v3 -; SI-NEXT: v_mov_b32_e32 v3, s41 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v31 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, s44, v21, 24 +; SI-NEXT: v_alignbit_b32 v48, s44, v21, 16 +; SI-NEXT: v_alignbit_b32 v51, s44, v21, 8 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16 -; SI-NEXT: v_alignbit_b32 v48, s43, v21, 8 -; SI-NEXT: v_alignbit_b32 v21, v3, v16, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 5 -; SI-NEXT: s_lshr_b32 s4, s9, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_alignbit_b32 v22, s42, v21, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v7 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s42, v21, 16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v7, v13, 
v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v3 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v7 ; SI-NEXT: v_mov_b32_e32 v7, s40 +; SI-NEXT: v_alignbit_b32 v50, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v21, v3, v16, 24 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v21, v3, v16, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v16, 8 -; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: s_lshr_b32 s4, s8, 24 -; SI-NEXT: v_or_b32_e32 v5, v30, v5 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v7, v14, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 7 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 +; SI-NEXT: v_alignbit_b32 v39, v3, v16, 8 +; SI-NEXT: v_alignbit_b32 v3, v7, v13, 24 +; SI-NEXT: s_lshr_b32 s4, s9, 8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v7, v14, 16 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: s_lshr_b32 s4, s8, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v15 +; SI-NEXT: v_alignbit_b32 v3, v7, v13, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v11 ; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_alignbit_b32 v3, v7, v14, 8 -; SI-NEXT: v_writelane_b32 v62, s4, 9 -; SI-NEXT: s_lshr_b32 s4, s7, 24 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_alignbit_b32 v3, v7, v13, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v11, v12, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_writelane_b32 v62, s4, 11 ; SI-NEXT: s_lshr_b32 s4, s7, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v19 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v11, v12, 16 -; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: s_lshr_b32 s4, s7, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 ; SI-NEXT: v_mov_b32_e32 v15, s14 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v11, v12, 8 -; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: v_writelane_b32 v62, s4, 13 ; SI-NEXT: s_lshr_b32 s4, s6, 24 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; 
SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v15, v10, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: v_alignbit_b32 v3, v15, v9, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_mov_b32_e32 v35, s6 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 ; SI-NEXT: v_mov_b32_e32 v34, s7 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 ; SI-NEXT: v_mov_b32_e32 v33, s8 ; SI-NEXT: v_mov_b32_e32 v32, s9 ; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 ; SI-NEXT: v_mov_b32_e32 v17, s11 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 ; SI-NEXT: v_mov_b32_e32 v18, s12 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 ; SI-NEXT: v_mov_b32_e32 v19, s13 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v15, v10, 16 -; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_alignbit_b32 v3, v15, v9, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 15 ; SI-NEXT: s_lshr_b32 s4, s6, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v15, v10, 8 -; SI-NEXT: v_alignbit_b32 v45, v19, v8, 24 -; SI-NEXT: 
v_alignbit_b32 v47, v19, v8, 16 -; SI-NEXT: v_alignbit_b32 v57, v19, v8, 8 -; SI-NEXT: v_alignbit_b32 v41, v18, v5, 24 -; SI-NEXT: v_alignbit_b32 v43, v18, v5, 16 -; SI-NEXT: v_alignbit_b32 v44, v18, v5, 8 -; SI-NEXT: v_alignbit_b32 v21, v17, v13, 24 -; SI-NEXT: v_alignbit_b32 v22, v17, v13, 16 -; SI-NEXT: v_alignbit_b32 v24, v17, v13, 8 -; SI-NEXT: v_alignbit_b32 v17, v20, v9, 24 -; SI-NEXT: v_alignbit_b32 v18, v20, v9, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v9, 8 -; SI-NEXT: v_alignbit_b32 v59, v32, v6, 24 -; SI-NEXT: v_alignbit_b32 v60, v32, v6, 16 -; SI-NEXT: v_alignbit_b32 v61, v32, v6, 8 -; SI-NEXT: v_alignbit_b32 v46, v33, v4, 24 -; SI-NEXT: v_alignbit_b32 v56, v33, v4, 16 -; SI-NEXT: v_alignbit_b32 v58, v33, v4, 8 -; SI-NEXT: v_alignbit_b32 v55, v34, v2, 24 -; SI-NEXT: v_alignbit_b32 v40, v34, v2, 16 -; SI-NEXT: v_alignbit_b32 v42, v34, v2, 8 -; SI-NEXT: v_alignbit_b32 v52, v35, v1, 24 -; SI-NEXT: v_alignbit_b32 v53, v35, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, v35, v1, 8 -; SI-NEXT: s_lshr_b32 s83, s57, 24 -; SI-NEXT: s_lshr_b32 s71, s57, 16 -; SI-NEXT: s_lshr_b32 s68, s57, 8 -; SI-NEXT: s_lshr_b32 s80, s47, 24 -; SI-NEXT: s_lshr_b32 s69, s47, 16 -; SI-NEXT: s_lshr_b32 s65, s47, 8 -; SI-NEXT: s_lshr_b32 s70, s45, 24 -; SI-NEXT: s_lshr_b32 s66, s45, 16 -; SI-NEXT: s_lshr_b32 s54, s45, 8 -; SI-NEXT: s_lshr_b32 s67, s43, 24 -; SI-NEXT: s_lshr_b32 s55, s43, 16 -; SI-NEXT: s_lshr_b32 s51, s43, 8 -; SI-NEXT: s_lshr_b32 s64, s41, 24 -; SI-NEXT: s_lshr_b32 s52, s41, 16 -; SI-NEXT: s_lshr_b32 s48, s41, 8 -; SI-NEXT: s_lshr_b32 s53, s40, 24 +; SI-NEXT: v_alignbit_b32 v3, v15, v9, 8 +; SI-NEXT: v_alignbit_b32 v55, v19, v6, 24 +; SI-NEXT: v_alignbit_b32 v40, v19, v6, 16 +; SI-NEXT: v_alignbit_b32 v41, v19, v6, 8 +; SI-NEXT: v_alignbit_b32 v52, v18, v5, 24 +; SI-NEXT: v_alignbit_b32 v53, v18, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v18, v5, 8 +; SI-NEXT: v_alignbit_b32 v21, v17, v2, 24 +; SI-NEXT: v_alignbit_b32 v22, v17, v2, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v2, 
8 +; SI-NEXT: v_alignbit_b32 v17, v20, v14, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v14, 8 +; SI-NEXT: v_alignbit_b32 v59, v32, v10, 24 +; SI-NEXT: v_alignbit_b32 v60, v32, v10, 16 +; SI-NEXT: v_alignbit_b32 v61, v32, v10, 8 +; SI-NEXT: v_alignbit_b32 v56, v33, v8, 24 +; SI-NEXT: v_alignbit_b32 v57, v33, v8, 16 +; SI-NEXT: v_alignbit_b32 v58, v33, v8, 8 +; SI-NEXT: v_alignbit_b32 v45, v34, v4, 24 +; SI-NEXT: v_alignbit_b32 v46, v34, v4, 16 +; SI-NEXT: v_alignbit_b32 v47, v34, v4, 8 +; SI-NEXT: v_alignbit_b32 v42, v35, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v35, v1, 16 +; SI-NEXT: v_alignbit_b32 v44, v35, v1, 8 +; SI-NEXT: s_lshr_b32 s21, s58, 24 +; SI-NEXT: s_lshr_b32 s18, s58, 16 +; SI-NEXT: s_lshr_b32 s60, s58, 8 +; SI-NEXT: s_lshr_b32 s17, s46, 24 +; SI-NEXT: s_lshr_b32 s56, s46, 16 +; SI-NEXT: s_lshr_b32 s96, s46, 8 +; SI-NEXT: s_lshr_b32 s59, s44, 24 +; SI-NEXT: s_lshr_b32 s86, s44, 16 +; SI-NEXT: s_lshr_b32 s80, s44, 8 +; SI-NEXT: s_lshr_b32 s87, s42, 24 +; SI-NEXT: s_lshr_b32 s71, s42, 16 +; SI-NEXT: s_lshr_b32 s65, s42, 8 +; SI-NEXT: s_lshr_b32 s69, s41, 24 +; SI-NEXT: s_lshr_b32 s64, s41, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 8 +; SI-NEXT: s_lshr_b32 s55, s40, 24 ; SI-NEXT: s_lshr_b32 s49, s40, 16 -; SI-NEXT: s_lshr_b32 s37, s40, 8 -; SI-NEXT: s_lshr_b32 s50, s15, 24 +; SI-NEXT: s_lshr_b32 s39, s40, 8 +; SI-NEXT: s_lshr_b32 s48, s15, 24 ; SI-NEXT: s_lshr_b32 s38, s15, 16 -; SI-NEXT: s_lshr_b32 s34, s15, 8 -; SI-NEXT: s_lshr_b32 s39, s14, 24 +; SI-NEXT: s_lshr_b32 s36, s15, 8 +; SI-NEXT: s_lshr_b32 s37, s14, 24 ; SI-NEXT: s_lshr_b32 s35, s14, 16 -; SI-NEXT: s_lshr_b32 s95, s14, 8 -; SI-NEXT: s_lshr_b32 s36, s13, 24 +; SI-NEXT: s_lshr_b32 s31, s14, 8 +; SI-NEXT: s_lshr_b32 s34, s13, 24 ; SI-NEXT: s_lshr_b32 s30, s13, 16 -; SI-NEXT: s_lshr_b32 s92, s13, 8 -; SI-NEXT: s_lshr_b32 s31, s12, 24 -; SI-NEXT: s_lshr_b32 s93, s12, 16 +; SI-NEXT: s_lshr_b32 s93, s13, 8 +; SI-NEXT: s_lshr_b32 s95, s12, 24 +; SI-NEXT: 
s_lshr_b32 s92, s12, 16 ; SI-NEXT: s_lshr_b32 s89, s12, 8 ; SI-NEXT: s_lshr_b32 s94, s11, 24 ; SI-NEXT: s_lshr_b32 s90, s11, 16 +; SI-NEXT: s_lshr_b32 s78, s11, 8 ; SI-NEXT: s_lshr_b32 s91, s10, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: s_lshr_b32 s79, s10, 16 +; SI-NEXT: s_lshr_b32 s75, s10, 8 +; SI-NEXT: s_lshr_b32 s88, s9, 24 +; SI-NEXT: s_lshr_b32 s76, s9, 16 +; SI-NEXT: s_lshr_b32 s77, s8, 24 +; SI-NEXT: s_lshr_b32 s25, s7, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 16 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s68, 8 -; SI-NEXT: s_lshl_b32 s16, s83, 24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: s_lshl_b32 s16, s21, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 ; SI-NEXT: v_readlane_b32 s85, v63, 29 ; SI-NEXT: v_readlane_b32 s84, v63, 28 ; SI-NEXT: v_readlane_b32 s83, v63, 27 ; SI-NEXT: v_readlane_b32 s82, v63, 26 ; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s70, v63, 22 ; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s50, 
v63, 10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_and_b32 s4, s58, 0xff ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -209430,29 +210181,25 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s69, 0xff +; SI-NEXT: s_and_b32 s5, s56, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s80, 24 +; SI-NEXT: s_lshl_b32 s16, s17, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; 
SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: s_lshl_b32 s16, s70, 24 +; SI-NEXT: s_lshl_b32 s5, s80, 8 +; SI-NEXT: s_lshl_b32 s16, s59, 24 +; SI-NEXT: v_readlane_b32 s96, v63, 32 ; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s54, v63, 14 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -209466,28 +210213,26 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s44, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v48 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_and_b32 s5, s86, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: s_lshl_b32 s16, s67, 24 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; 
SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_lshl_b32 s16, s87, 24 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s65, v63, 17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 @@ -209499,24 +210244,24 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v50 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s42, 0xff ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s55, 0xff +; SI-NEXT: s_and_b32 s5, s71, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: s_lshl_b32 s16, s64, 24 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s51, v63, 11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -209530,32 +210275,30 @@ define inreg <128 x i8> 
@bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s41, 0xff ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s52, 0xff +; SI-NEXT: s_and_b32 s5, s64, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s37, 8 -; SI-NEXT: s_lshl_b32 s16, s53, 24 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_lshl_b32 s5, s39, 8 +; SI-NEXT: s_lshl_b32 s16, s55, 24 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s39, v63, 7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 @@ 
-209566,7 +210309,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v13 ; SI-NEXT: s_and_b32 s4, s40, 0xff ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s49, 0xff @@ -209574,9 +210317,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_lshl_b32 s5, s36, 8 ; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s36, v63, 4 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 @@ -209603,13 +210346,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s38, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s15, s50, 24 +; SI-NEXT: s_lshl_b32 s15, s48, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s15, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s95, 8 -; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: s_lshl_b32 s5, s31, 8 +; SI-NEXT: v_readlane_b32 s48, v63, 8 ; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 @@ -209631,30 +210375,30 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 ; SI-NEXT: s_and_b32 s4, s14, 0xff ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s35, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s39, 24 +; SI-NEXT: s_lshl_b32 s14, s37, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s14, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: v_readlane_b32 s37, v63, 5 ; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen @@ -209662,45 +210406,45 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v41 ; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, 
v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v40 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s30, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v55 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s13, s36, 24 +; SI-NEXT: s_lshl_b32 s13, s34, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s13, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 ; SI-NEXT: s_and_b32 s4, s12, 0xff ; SI-NEXT: s_lshl_b32 s5, s89, 8 ; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s93, 0xff +; SI-NEXT: s_and_b32 s5, s92, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v52 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s31, 24 +; SI-NEXT: s_lshl_b32 s12, s95, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s12, s5 ; SI-NEXT: 
v_or_b32_e32 v3, v3, v5 @@ -209710,131 +210454,124 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 1 ; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v24 ; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v22 +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s90, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v21 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s11, s94, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s11, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: 
buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v18 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 2 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s10, s91, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s10, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 6 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 9 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v61 ; SI-NEXT: s_and_b32 s4, s9, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; SI-NEXT: 
v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v60 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v60 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v59 +; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v59 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_lshl_b32 s9, s88, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s9, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 9 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s5, v62, 11 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s8, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v58 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v58 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v56 +; SI-NEXT: v_readlane_b32 s5, v62, 10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v57 ; 
SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s8, v62, 7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v56 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s8, s8, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_lshl_b32 s8, s77, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 12 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s5, v62, 13 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s7, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v42 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 11 +; SI-NEXT: v_readlane_b32 s5, v62, 12 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v46 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 10 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v45 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, 
s7, 24 +; SI-NEXT: s_lshl_b32 s7, s25, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -209842,7 +210579,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: v_readlane_b32 s5, v62, 16 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 @@ -209850,16 +210587,16 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s6, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 14 +; SI-NEXT: v_readlane_b32 s5, v62, 15 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s6, v62, 13 +; SI-NEXT: v_readlane_b32 s6, v62, 14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209888,8 +210625,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s30, v63, 0 
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload @@ -209902,12 +210638,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $vcc_lo -; SI-NEXT: v_mov_b32_e32 v39, v24 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: v_mov_b32_e32 v38, v22 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 @@ -209922,139 +210652,136 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v30, v18 +; SI-NEXT: v_mov_b32_e32 v26, v18 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v29, v17 +; SI-NEXT: v_mov_b32_e32 v28, v17 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_mov_b32_e32 v25, v14 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_mov_b32_e32 v30, v10 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v29, v8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; 
implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; kill: killed $vcc_lo -; SI-NEXT: ; implicit-def: $vcc_lo -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr83 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: 
; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr11 +; 
SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; kill: killed $vcc_lo -; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vcc_lo ; SI-NEXT: ; kill: killed $vcc_lo -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; kill: killed $sgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: 
$vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -210783,17 +211510,17 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 ; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 2 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 2 ; VI-NEXT: v_mov_b32_e32 v15, s16 ; VI-NEXT: s_and_b32 s16, s45, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_lshl_b32 s17, s17, 8 ; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: v_readlane_b32 s17, v21, 1 ; VI-NEXT: v_readlane_b32 s18, v21, 0 ; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen @@ -212553,257 +213280,258 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v75, s30, 0 -; GFX11-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-NEXT: v_writelane_b32 v74, s30, 0 +; GFX11-NEXT: v_writelane_b32 v75, s96, 0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 ; 
GFX11-NEXT: v_readfirstlane_b32 s40, v1 ; GFX11-NEXT: v_readfirstlane_b32 s41, v2 -; GFX11-NEXT: v_writelane_b32 v75, s31, 1 -; GFX11-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-NEXT: v_writelane_b32 v74, s31, 1 +; GFX11-NEXT: v_writelane_b32 v75, s97, 1 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 ; GFX11-NEXT: v_readfirstlane_b32 s12, v5 -; GFX11-NEXT: v_writelane_b32 v75, s34, 2 -; GFX11-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-NEXT: v_writelane_b32 v74, s34, 2 +; GFX11-NEXT: v_writelane_b32 v75, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s13, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v75, s35, 3 -; GFX11-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-NEXT: v_writelane_b32 v74, s35, 3 +; GFX11-NEXT: v_writelane_b32 v75, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v75, s36, 4 -; GFX11-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-NEXT: v_writelane_b32 v74, s36, 4 +; GFX11-NEXT: v_writelane_b32 v75, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s4, v13 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 -; GFX11-NEXT: v_writelane_b32 v75, s37, 5 -; GFX11-NEXT: v_writelane_b32 v76, s101, 5 -; GFX11-NEXT: s_mov_b32 s99, 0 +; GFX11-NEXT: v_writelane_b32 v74, s37, 5 +; GFX11-NEXT: v_writelane_b32 v75, s101, 5 +; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:48 -; 
GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 -; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 +; GFX11-NEXT: v_writelane_b32 v74, s38, 6 +; GFX11-NEXT: v_writelane_b32 v75, s102, 6 ; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR 
spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s39, 7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 -; GFX11-NEXT: v_writelane_b32 v75, s48, 8 -; GFX11-NEXT: v_writelane_b32 v76, s104, 8 -; GFX11-NEXT: v_writelane_b32 v75, s49, 9 -; GFX11-NEXT: v_writelane_b32 v75, s50, 10 -; GFX11-NEXT: v_writelane_b32 v75, s51, 11 -; GFX11-NEXT: v_writelane_b32 v75, s52, 12 -; GFX11-NEXT: v_writelane_b32 v75, s53, 13 -; GFX11-NEXT: v_writelane_b32 v75, s54, 14 -; GFX11-NEXT: v_writelane_b32 v75, s55, 15 -; GFX11-NEXT: v_writelane_b32 v75, s64, 16 -; GFX11-NEXT: v_writelane_b32 v75, s65, 17 -; GFX11-NEXT: v_writelane_b32 v75, s66, 18 -; GFX11-NEXT: v_writelane_b32 v75, s67, 19 -; GFX11-NEXT: v_writelane_b32 v75, s68, 20 -; GFX11-NEXT: v_writelane_b32 v75, s69, 21 -; GFX11-NEXT: v_writelane_b32 v75, s70, 22 -; GFX11-NEXT: v_writelane_b32 v75, s71, 23 -; GFX11-NEXT: v_writelane_b32 v75, s80, 24 -; GFX11-NEXT: v_writelane_b32 v75, s81, 25 -; GFX11-NEXT: v_writelane_b32 v75, s82, 26 -; GFX11-NEXT: v_writelane_b32 v75, s83, 27 -; GFX11-NEXT: v_writelane_b32 v75, s84, 28 -; GFX11-NEXT: v_writelane_b32 v75, s85, 29 -; GFX11-NEXT: v_writelane_b32 v75, s86, 30 -; GFX11-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v74, s39, 7 +; GFX11-NEXT: v_writelane_b32 v75, s103, 7 +; GFX11-NEXT: v_writelane_b32 v74, s48, 8 +; GFX11-NEXT: v_writelane_b32 v75, s104, 8 +; GFX11-NEXT: v_writelane_b32 v74, s49, 9 +; GFX11-NEXT: v_writelane_b32 v74, s50, 10 +; GFX11-NEXT: v_writelane_b32 v74, s51, 11 +; GFX11-NEXT: v_writelane_b32 v74, s52, 12 +; GFX11-NEXT: v_writelane_b32 v74, s53, 13 +; GFX11-NEXT: v_writelane_b32 v74, s54, 14 +; GFX11-NEXT: v_writelane_b32 v74, s55, 15 +; GFX11-NEXT: v_writelane_b32 v74, s64, 16 +; GFX11-NEXT: v_writelane_b32 v74, s65, 17 +; GFX11-NEXT: v_writelane_b32 v74, s66, 18 +; GFX11-NEXT: v_writelane_b32 v74, s67, 19 +; GFX11-NEXT: v_writelane_b32 v74, s68, 20 +; GFX11-NEXT: 
v_writelane_b32 v74, s69, 21 +; GFX11-NEXT: v_writelane_b32 v74, s70, 22 +; GFX11-NEXT: v_writelane_b32 v74, s71, 23 +; GFX11-NEXT: v_writelane_b32 v74, s80, 24 +; GFX11-NEXT: v_writelane_b32 v74, s81, 25 +; GFX11-NEXT: v_writelane_b32 v74, s82, 26 +; GFX11-NEXT: v_writelane_b32 v74, s83, 27 +; GFX11-NEXT: v_writelane_b32 v74, s84, 28 +; GFX11-NEXT: v_writelane_b32 v74, s85, 29 +; GFX11-NEXT: v_writelane_b32 v74, s86, 30 +; GFX11-NEXT: v_writelane_b32 v74, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB99_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s27, 8 ; GFX11-NEXT: s_lshr_b32 s43, s27, 24 ; GFX11-NEXT: s_lshr_b32 s34, s5, 24 ; GFX11-NEXT: s_lshr_b32 s35, s5, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-NEXT: v_writelane_b32 v76, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s26, 16 ; GFX11-NEXT: s_lshr_b32 s37, s5, 8 ; GFX11-NEXT: s_lshr_b32 s36, s4, 16 ; GFX11-NEXT: s_lshr_b32 s38, s4, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-NEXT: v_writelane_b32 v76, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s26, 8 ; GFX11-NEXT: s_lshr_b32 s39, s7, 24 ; GFX11-NEXT: s_lshr_b32 s48, s7, 16 ; GFX11-NEXT: s_lshr_b32 s50, s7, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-NEXT: v_writelane_b32 v76, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s25, 24 ; GFX11-NEXT: s_lshr_b32 s49, s6, 16 ; GFX11-NEXT: s_lshr_b32 s51, s6, 8 ; GFX11-NEXT: s_lshr_b32 s52, s9, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-NEXT: v_writelane_b32 v76, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s25, 16 ; GFX11-NEXT: s_lshr_b32 s53, s9, 16 ; GFX11-NEXT: s_lshr_b32 s55, s9, 8 ; GFX11-NEXT: s_lshr_b32 s54, s8, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-NEXT: v_writelane_b32 v76, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s25, 8 ; GFX11-NEXT: s_lshr_b32 s64, s8, 8 ; GFX11-NEXT: s_lshr_b32 s65, s11, 24 ; 
GFX11-NEXT: s_lshr_b32 s66, s11, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-NEXT: v_writelane_b32 v76, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s24, 16 ; GFX11-NEXT: s_lshr_b32 s68, s11, 8 ; GFX11-NEXT: s_lshr_b32 s67, s10, 16 ; GFX11-NEXT: s_lshr_b32 s69, s10, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-NEXT: v_writelane_b32 v76, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s24, 8 ; GFX11-NEXT: s_lshr_b32 s70, s13, 24 ; GFX11-NEXT: s_lshr_b32 s71, s13, 16 ; GFX11-NEXT: s_lshr_b32 s81, s13, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-NEXT: v_writelane_b32 v76, s42, 0 ; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: s_lshr_b32 s80, s12, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-NEXT: v_writelane_b32 v77, s42, 31 ; GFX11-NEXT: s_lshr_b32 s42, s23, 16 ; GFX11-NEXT: s_lshr_b32 s82, s12, 8 ; GFX11-NEXT: s_lshr_b32 s83, s15, 24 ; GFX11-NEXT: s_lshr_b32 s84, s15, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-NEXT: v_writelane_b32 v77, s42, 30 ; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_lshr_b32 s86, s15, 8 ; GFX11-NEXT: s_lshr_b32 s85, s14, 16 ; GFX11-NEXT: s_lshr_b32 s87, s14, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-NEXT: v_writelane_b32 v77, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s22, 16 ; GFX11-NEXT: s_lshr_b32 s96, s41, 24 ; GFX11-NEXT: s_lshr_b32 s97, s41, 16 -; GFX11-NEXT: s_lshr_b32 s100, s41, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-NEXT: s_lshr_b32 s99, s41, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s22, 8 ; GFX11-NEXT: s_lshr_b32 s98, s40, 16 -; GFX11-NEXT: s_lshr_b32 s101, s40, 8 -; GFX11-NEXT: s_lshr_b32 s102, s29, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-NEXT: s_lshr_b32 s100, s40, 8 +; GFX11-NEXT: s_lshr_b32 s101, s29, 24 +; GFX11-NEXT: v_writelane_b32 v77, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b32 s103, s29, 16 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s29, 8 -; GFX11-NEXT: s_lshr_b32 s104, s28, 16 -; 
GFX11-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-NEXT: s_lshr_b32 s104, s29, 8 +; GFX11-NEXT: s_lshr_b32 s103, s28, 16 +; GFX11-NEXT: v_writelane_b32 v77, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 ; GFX11-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 ; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-NEXT: v_writelane_b32 v77, s42, 25 ; GFX11-NEXT: s_lshr_b32 s42, s21, 8 ; GFX11-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-NEXT: v_writelane_b32 v77, s42, 24 ; GFX11-NEXT: s_lshr_b32 s42, s20, 16 ; GFX11-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-NEXT: v_writelane_b32 v77, s42, 23 ; GFX11-NEXT: s_lshr_b32 s42, s20, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-NEXT: v_writelane_b32 v77, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s19, 24 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-NEXT: v_writelane_b32 v77, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-NEXT: v_writelane_b32 v77, s42, 20 ; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-NEXT: v_writelane_b32 v77, s42, 19 ; GFX11-NEXT: s_lshr_b32 s42, s18, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 
v78, s42, 18 +; GFX11-NEXT: v_writelane_b32 v77, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-NEXT: v_writelane_b32 v77, s42, 17 ; GFX11-NEXT: s_lshr_b32 s42, s17, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-NEXT: v_writelane_b32 v77, s42, 16 ; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-NEXT: v_writelane_b32 v77, s42, 15 ; GFX11-NEXT: s_lshr_b32 s42, s17, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-NEXT: v_writelane_b32 v77, s42, 14 ; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-NEXT: v_writelane_b32 v77, s42, 13 ; GFX11-NEXT: s_lshr_b32 s42, s16, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-NEXT: v_writelane_b32 v77, s42, 12 ; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-NEXT: v_writelane_b32 v77, s42, 11 ; GFX11-NEXT: s_lshr_b32 s42, s3, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-NEXT: v_writelane_b32 v77, s42, 10 ; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-NEXT: v_writelane_b32 v77, s42, 9 ; GFX11-NEXT: s_lshr_b32 s42, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-NEXT: v_writelane_b32 v77, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s2, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-NEXT: v_writelane_b32 v77, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s1, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-NEXT: v_writelane_b32 v77, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-NEXT: v_writelane_b32 v77, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-NEXT: v_writelane_b32 v77, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s0, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-NEXT: v_writelane_b32 v77, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-NEXT: v_writelane_b32 v77, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s28, 8 -; GFX11-NEXT: v_writelane_b32 v78, s74, 0 -; GFX11-NEXT: v_writelane_b32 v78, s75, 1 +; GFX11-NEXT: v_writelane_b32 v77, s74, 0 +; GFX11-NEXT: v_writelane_b32 v77, s75, 1 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s99 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi ; GFX11-NEXT: s_cbranch_vccnz .LBB99_4 ; GFX11-NEXT: .LBB99_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v39, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v38, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, s22, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v51, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v50, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v53, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v37, s19, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v33, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v29, s23, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, s22, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v32, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s41, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, s40, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v36, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, s41, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s40, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] @@ -212814,115 +213542,112 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v53, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v37, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v55, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v54, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v39, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v38, s16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v21, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v20, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v36, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[38:39] -; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[28:29] -; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[32:33] -; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[50:51] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[24:25] +; 
GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[28:29] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] ; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[20:21] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[24:25] -; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[36:37] -; GFX11-NEXT: v_lshrrev_b64 v[82:83], 24, v[52:53] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[20:21] +; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] ; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] ; GFX11-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] ; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[15:16] -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 24, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 8, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 16, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 16, v37 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 16, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 24, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 24, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 24, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 8, v37 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 8, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 24, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v54 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v54 ; GFX11-NEXT: v_lshrrev_b32_e32 v19, 24, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 24, v12 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v117, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 24, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 
v135, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v13 ; GFX11-NEXT: s_branch .LBB99_5 ; GFX11-NEXT: .LBB99_3: ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s99, -1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr44 @@ -212930,13 +213655,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr56 ; GFX11-NEXT: ; implicit-def: $sgpr58 ; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr104 ; GFX11-NEXT: ; implicit-def: $sgpr102 ; GFX11-NEXT: ; implicit-def: $sgpr101 -; GFX11-NEXT: ; implicit-def: $sgpr98 ; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr99 ; GFX11-NEXT: ; implicit-def: $sgpr97 ; GFX11-NEXT: ; implicit-def: $sgpr96 ; GFX11-NEXT: ; implicit-def: $sgpr87 @@ -212980,8 +213705,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 +; GFX11-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-NEXT: v_writelane_b32 v77, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -213060,419 +213785,422 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB99_2 ; GFX11-NEXT: .LBB99_4: -; GFX11-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 -; GFX11-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-NEXT: v_dual_mov_b32 v54, s0 :: v_dual_mov_b32 v55, s1 +; 
GFX11-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 +; GFX11-NEXT: v_dual_mov_b32 v15, s40 :: v_dual_mov_b32 v16, s41 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 3 ; GFX11-NEXT: v_mov_b32_e32 v71, s50 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 -; GFX11-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v74, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 3 ; GFX11-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 ; GFX11-NEXT: v_dual_mov_b32 v9, s12 :: v_dual_mov_b32 v10, s13 -; GFX11-NEXT: v_mov_b32_e32 v73, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 4 -; GFX11-NEXT: v_mov_b32_e32 v55, s48 +; GFX11-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 4 ; GFX11-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 ; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v72, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 5 -; GFX11-NEXT: v_mov_b32_e32 v49, s39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 5 ; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 -; GFX11-NEXT: v_mov_b32_e32 v62, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 6 -; GFX11-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 -; GFX11-NEXT: v_dual_mov_b32 v38, s16 :: v_dual_mov_b32 v39, s17 +; GFX11-NEXT: v_mov_b32_e32 v61, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-NEXT: v_dual_mov_b32 v51, s48 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v53, s3 
:: v_dual_mov_b32 v38, s16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v63, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 7 -; GFX11-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v36, s18 +; GFX11-NEXT: v_dual_mov_b32 v39, s17 :: v_dual_mov_b32 v62, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-NEXT: v_dual_mov_b32 v49, s39 :: v_dual_mov_b32 v36, s18 ; GFX11-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v32, s20 ; GFX11-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v60, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-NEXT: v_readlane_b32 s0, v77, 8 ; GFX11-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 ; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v61, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 9 -; GFX11-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 -; GFX11-NEXT: v_dual_mov_b32 v146, s42 :: v_dual_mov_b32 v145, s104 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v59, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 10 -; GFX11-NEXT: v_dual_mov_b32 v144, vcc_hi :: v_dual_mov_b32 v135, s103 -; GFX11-NEXT: v_dual_mov_b32 v134, s102 :: v_dual_mov_b32 v133, s101 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readlane_b32 s0, v77, 9 +; GFX11-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v20, s26 +; GFX11-NEXT: v_mov_b32_e32 v21, s27 +; GFX11-NEXT: v_dual_mov_b32 v145, s42 :: v_dual_mov_b32 v144, s103 +; GFX11-NEXT: v_mov_b32_e32 v58, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 10 +; GFX11-NEXT: v_mov_b32_e32 v31, s36 +; GFX11-NEXT: v_dual_mov_b32 v135, s104 :: v_dual_mov_b32 v134, s102 +; GFX11-NEXT: v_dual_mov_b32 v133, s101 :: v_dual_mov_b32 v132, s100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v56, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 11 +; GFX11-NEXT: v_dual_mov_b32 v131, s98 :: v_dual_mov_b32 v130, s99 +; GFX11-NEXT: v_dual_mov_b32 v129, s97 :: v_dual_mov_b32 v128, s96 ; GFX11-NEXT: v_mov_b32_e32 v57, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 11 -; GFX11-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v132, s98 -; GFX11-NEXT: v_dual_mov_b32 v131, s100 :: v_dual_mov_b32 v130, s97 -; GFX11-NEXT: v_dual_mov_b32 v129, s96 :: v_dual_mov_b32 v58, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 12 -; GFX11-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v128, s87 -; GFX11-NEXT: v_dual_mov_b32 v119, s85 :: v_dual_mov_b32 v118, s86 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v117, s84 :: v_dual_mov_b32 v56, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 13 -; GFX11-NEXT: v_dual_mov_b32 v116, s83 :: v_dual_mov_b32 v115, s82 -; GFX11-NEXT: v_dual_mov_b32 v114, s80 :: v_dual_mov_b32 v113, s81 +; GFX11-NEXT: v_readlane_b32 s0, v77, 12 +; GFX11-NEXT: v_dual_mov_b32 v119, s87 :: v_dual_mov_b32 v118, s85 +; GFX11-NEXT: v_dual_mov_b32 v117, s86 :: v_dual_mov_b32 v116, s84 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v47, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 14 -; GFX11-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v112, s71 -; GFX11-NEXT: v_dual_mov_b32 v103, s70 :: v_dual_mov_b32 v102, s69 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v101, s67 :: v_dual_mov_b32 v46, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 15 -; GFX11-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v100, s68 -; GFX11-NEXT: v_dual_mov_b32 v99, s66 :: v_dual_mov_b32 v98, s65 -; GFX11-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v44, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 16 -; GFX11-NEXT: v_dual_mov_b32 v96, s54 :: 
v_dual_mov_b32 v87, s55 -; GFX11-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v85, s52 +; GFX11-NEXT: v_readlane_b32 s0, v77, 13 +; GFX11-NEXT: v_mov_b32_e32 v27, s37 +; GFX11-NEXT: v_dual_mov_b32 v115, s83 :: v_dual_mov_b32 v114, s82 +; GFX11-NEXT: v_dual_mov_b32 v113, s80 :: v_dual_mov_b32 v112, s81 +; GFX11-NEXT: v_mov_b32_e32 v46, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 14 +; GFX11-NEXT: v_dual_mov_b32 v103, s71 :: v_dual_mov_b32 v102, s70 +; GFX11-NEXT: v_dual_mov_b32 v101, s69 :: v_dual_mov_b32 v100, s67 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v45, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 17 -; GFX11-NEXT: v_dual_mov_b32 v84, s51 :: v_dual_mov_b32 v83, s49 -; GFX11-NEXT: v_dual_mov_b32 v147, s43 :: v_dual_mov_b32 v22, s78 +; GFX11-NEXT: v_readlane_b32 s0, v77, 15 +; GFX11-NEXT: v_dual_mov_b32 v99, s68 :: v_dual_mov_b32 v98, s66 +; GFX11-NEXT: v_dual_mov_b32 v97, s65 :: v_dual_mov_b32 v96, s64 ; GFX11-NEXT: v_mov_b32_e32 v43, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 18 -; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 -; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readlane_b32 s0, v77, 16 +; GFX11-NEXT: v_mov_b32_e32 v23, s35 +; GFX11-NEXT: v_dual_mov_b32 v87, s54 :: v_dual_mov_b32 v86, s55 +; GFX11-NEXT: v_dual_mov_b32 v85, s53 :: v_dual_mov_b32 v84, s52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mov_b32_e32 v44, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 17 +; GFX11-NEXT: v_mov_b32_e32 v19, s34 +; GFX11-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v82, s49 +; GFX11-NEXT: v_dual_mov_b32 v65, s60 :: v_dual_mov_b32 v30, s90 ; GFX11-NEXT: v_mov_b32_e32 v42, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 19 -; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 -; GFX11-NEXT: 
v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 +; GFX11-NEXT: v_readlane_b32 s0, v77, 18 +; GFX11-NEXT: v_dual_mov_b32 v69, s56 :: v_dual_mov_b32 v34, s92 +; GFX11-NEXT: v_mov_b32_e32 v17, s74 +; GFX11-NEXT: v_mov_b32_e32 v67, s30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v41, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 20 -; GFX11-NEXT: v_mov_b32_e32 v48, s62 -; GFX11-NEXT: v_mov_b32_e32 v54, s72 -; GFX11-NEXT: v_mov_b32_e32 v64, s60 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v183, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 21 -; GFX11-NEXT: v_mov_b32_e32 v80, s46 -; GFX11-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-NEXT: v_readlane_b32 s0, v77, 19 +; GFX11-NEXT: v_mov_b32_e32 v146, s43 +; GFX11-NEXT: v_mov_b32_e32 v50, s62 +; GFX11-NEXT: v_mov_b32_e32 v64, s72 +; GFX11-NEXT: v_mov_b32_e32 v66, s58 ; GFX11-NEXT: v_mov_b32_e32 v40, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v77, 20 +; GFX11-NEXT: v_mov_b32_e32 v70, s46 +; GFX11-NEXT: v_mov_b32_e32 v80, s44 +; GFX11-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-NEXT: v_mov_b32_e32 v22, s78 ; GFX11-NEXT: v_mov_b32_e32 v182, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-NEXT: v_readlane_b32 s0, v77, 21 +; GFX11-NEXT: v_mov_b32_e32 v26, s88 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v48, s94 :: v_dual_mov_b32 v183, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 22 ; GFX11-NEXT: v_mov_b32_e32 v181, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-NEXT: v_readlane_b32 s0, v77, 23 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v180, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 25 -; GFX11-NEXT: v_mov_b32_e32 v178, s0 -; GFX11-NEXT: 
v_readlane_b32 s0, v78, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v77, 24 ; GFX11-NEXT: v_mov_b32_e32 v179, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-NEXT: v_readlane_b32 s0, v77, 25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v177, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-NEXT: v_readlane_b32 s0, v77, 26 +; GFX11-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v176, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-NEXT: v_readlane_b32 s0, v77, 28 ; GFX11-NEXT: v_mov_b32_e32 v167, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-NEXT: v_readlane_b32 s0, v77, 29 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v165, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 31 ; GFX11-NEXT: v_mov_b32_e32 v166, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v77, 30 ; GFX11-NEXT: v_mov_b32_e32 v164, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-NEXT: v_readlane_b32 s0, v77, 31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 0 ; GFX11-NEXT: v_mov_b32_e32 v163, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-NEXT: v_readlane_b32 s0, v76, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v162, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 3 -; GFX11-NEXT: v_mov_b32_e32 v160, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 2 ; GFX11-NEXT: v_mov_b32_e32 v161, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-NEXT: v_readlane_b32 s0, v76, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v151, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v150, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-NEXT: v_readlane_b32 s0, v76, 6 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 -; GFX11-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-NEXT: v_mov_b32_e32 v147, s0 +; GFX11-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-NEXT: v_readlane_b32 s1, v77, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v81, s0 ; GFX11-NEXT: .LBB99_5: ; %end -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v82 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v63 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v81 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v73 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GFX11-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v81 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v81, 8, v73 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v55 ; GFX11-NEXT: v_lshlrev_b32_e32 v58, 8, v58 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_or_b32_e32 v66, v69, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-NEXT: v_and_b32_e32 v56, 0xff, v56 +; GFX11-NEXT: v_lshlrev_b32_e32 v57, 8, v57 +; GFX11-NEXT: v_or_b32_e32 v54, v54, v81 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v72 ; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v80 -; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v66 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-NEXT: v_and_b32_e32 v46, 0xff, v46 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v68, v81, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v63 +; GFX11-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GFX11-NEXT: v_or_b32_e32 v70, v46, v70 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 ; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v81 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v61 +; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v62 +; GFX11-NEXT: v_and_b32_e32 v62, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v60 +; GFX11-NEXT: v_or_b32_e32 v81, v81, v61 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-NEXT: v_or_b32_e32 v69, v41, v69 +; GFX11-NEXT: v_or_b32_e32 v52, v52, v54 +; GFX11-NEXT: v_or_b32_e32 v54, v62, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v59 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v54 ; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-NEXT: v_or_b32_e32 v69, 
v69, v82 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v60 -; GFX11-NEXT: v_and_b32_e32 v60, 0xff, v61 +; GFX11-NEXT: v_or_b32_e32 v68, v80, v68 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v53 +; GFX11-NEXT: v_or_b32_e32 v53, v55, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v47 ; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v64 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_or_b32_e32 v82, v50, v82 -; GFX11-NEXT: v_or_b32_e32 v81, v60, v81 -; GFX11-NEXT: v_or_b32_e32 v50, v52, v66 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v69 -; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v51 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v59 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-NEXT: v_or_b32_e32 v51, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v82 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v81 -; GFX11-NEXT: v_or_b32_e32 v66, v66, v69 -; GFX11-NEXT: v_or_b32_e32 v69, v57, v58 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v47 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v53, 0xffff, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v68 +; GFX11-NEXT: v_or_b32_e32 v68, v80, v58 +; GFX11-NEXT: v_or_b32_e32 v80, v56, v57 ; GFX11-NEXT: v_or_b32_e32 v38, v38, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v80 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v46 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v44 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v45 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v66 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v43 +; GFX11-NEXT: v_or_b32_e32 v54, v54, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v80 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v45 +; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v44 ; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v65 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v70 ; GFX11-NEXT: v_or_b32_e32 v39, v39, v80 -; GFX11-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-NEXT: v_or_b32_e32 v70, v81, v43 ; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v43 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v42 -; GFX11-NEXT: v_or_b32_e32 v36, v38, v66 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v42 +; GFX11-NEXT: v_or_b32_e32 v36, v38, v68 ; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v66, v80, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v70 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v70 ; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v41 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v183 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v40 +; GFX11-NEXT: v_or_b32_e32 v68, v80, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v40 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v182 +; GFX11-NEXT: v_lshlrev_b32_e32 v182, 8, v183 ; GFX11-NEXT: v_or_b32_e32 v37, v38, v39 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v66 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v68 ; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v66, v70, v80 -; GFX11-NEXT: v_or_b32_e32 v69, v81, v82 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v181 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v182 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v180 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-NEXT: v_or_b32_e32 v68, v70, v80 +; GFX11-NEXT: v_or_b32_e32 v69, v81, v182 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v181 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v180 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v179 +; GFX11-NEXT: v_and_b32_e32 v68, 0xffff, v68 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v67, v80, v67 ; GFX11-NEXT: v_or_b32_e32 v32, v32, v70 +; GFX11-NEXT: v_or_b32_e32 v66, v80, v66 ; GFX11-NEXT: 
v_or_b32_e32 v33, v33, v81 ; GFX11-NEXT: v_or_b32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v39, v66, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v67 -; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v178 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v179 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v177 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v176 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v167 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v165 -; GFX11-NEXT: v_lshlrev_b32_e32 v165, 8, v166 -; GFX11-NEXT: v_or_b32_e32 v67, v67, v69 +; GFX11-NEXT: v_or_b32_e32 v39, v68, v69 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v178 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v167 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v166 +; GFX11-NEXT: v_and_b32_e32 v164, 0xff, v164 +; GFX11-NEXT: v_lshlrev_b32_e32 v165, 8, v165 +; GFX11-NEXT: v_or_b32_e32 v68, v68, v69 ; GFX11-NEXT: v_or_b32_e32 v28, v28, v70 -; GFX11-NEXT: v_or_b32_e32 v64, v80, v64 +; GFX11-NEXT: v_or_b32_e32 v65, v80, v65 ; GFX11-NEXT: v_or_b32_e32 v29, v29, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v165 +; GFX11-NEXT: v_or_b32_e32 v69, v164, v165 ; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v66 ; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 ; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v65 ; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off ; GFX11-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 ; GFX11-NEXT: v_or_b32_e32 v36, v32, v66 -; GFX11-NEXT: v_or_b32_e32 v37, v33, v67 -; GFX11-NEXT: v_or_b32_e32 v38, v28, v64 +; 
GFX11-NEXT: v_or_b32_e32 v37, v33, v68 +; GFX11-NEXT: v_or_b32_e32 v38, v28, v65 ; GFX11-NEXT: v_or_b32_e32 v39, v29, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v163 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v162 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v160 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v162 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v160 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v150 ; GFX11-NEXT: v_or_b32_e32 v24, v24, v28 ; GFX11-NEXT: v_or_b32_e32 v28, v29, v32 ; GFX11-NEXT: v_or_b32_e32 v25, v25, v33 -; GFX11-NEXT: v_or_b32_e32 v29, v50, v51 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v52 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v150 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v48 +; GFX11-NEXT: v_or_b32_e32 v29, v52, v53 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v50 ; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v149 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v148 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v147 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v146 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v145 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v147 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v146 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v54 ; GFX11-NEXT: v_or_b32_e32 v32, v32, v33 -; GFX11-NEXT: v_or_b32_e32 v21, v21, v48 -; GFX11-NEXT: v_or_b32_e32 v33, v50, v51 -; GFX11-NEXT: 
v_or_b32_e32 v15, v15, v52 -; GFX11-NEXT: v_or_b32_e32 v48, v53, v54 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v50 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_or_b32_e32 v33, v52, v53 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v145 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v144 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v67 ; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_or_b32_e32 v50, v24, v28 -; GFX11-NEXT: v_or_b32_e32 v52, v20, v32 -; GFX11-NEXT: v_or_b32_e32 v53, v21, v33 -; GFX11-NEXT: v_or_b32_e32 v64, v15, v48 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v144 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v135 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v134 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v133 -; GFX11-NEXT: v_or_b32_e32 v51, v25, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v65 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v54 +; GFX11-NEXT: v_or_b32_e32 v50, v55, v64 +; GFX11-NEXT: v_or_b32_e32 v52, v24, v28 +; GFX11-NEXT: v_or_b32_e32 v53, v25, v29 +; GFX11-NEXT: v_or_b32_e32 v54, v20, v32 +; GFX11-NEXT: v_or_b32_e32 v55, v21, v33 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v131 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-NEXT: v_or_b32_e32 v13, v13, v24 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v130 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 
8, v129 -; GFX11-NEXT: v_or_b32_e32 v20, v25, v28 -; GFX11-NEXT: v_or_b32_e32 v14, v14, v29 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v135 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v134 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v133 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v132 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v130 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v20, v21, v24 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v25 +; GFX11-NEXT: v_or_b32_e32 v21, v28, v29 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v32 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v129 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v128 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v119 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v119 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v34 ; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v118 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v117 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v116 -; GFX11-NEXT: v_or_b32_e32 v21, v21, v24 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v117 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v116 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v115 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v28 +; GFX11-NEXT: v_or_b32_e32 v25, v29, v32 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-NEXT: v_or_b32_e32 v28, v34, v48 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v25 -; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 -; 
GFX11-NEXT: v_or_b32_e32 v12, v12, v32 -; GFX11-NEXT: v_or_b32_e32 v25, v33, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_or_b32_e32 v65, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v66, v13, v20 -; GFX11-NEXT: v_or_b32_e32 v67, v14, v21 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v21 +; GFX11-NEXT: v_or_b32_e32 v64, v11, v25 +; GFX11-NEXT: v_or_b32_e32 v65, v12, v28 ; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v115 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v30 ; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v113 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v112 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v103 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v24 -; GFX11-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v112 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v102 -; GFX11-NEXT: v_or_b32_e32 v9, v9, v13 -; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX11-NEXT: v_or_b32_e32 v14, v20, v21 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v101 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v26 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v101 +; GFX11-NEXT: 
v_or_b32_e32 v16, v16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v103 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v102 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v28 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v26 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v97 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v87 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v24 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v99 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v98 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v20 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v25 -; GFX11-NEXT: v_or_b32_e32 v20, v26, v22 +; GFX11-NEXT: v_or_b32_e32 v12, v24, v25 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v99 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v98 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v97 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v28 +; GFX11-NEXT: v_or_b32_e32 v22, v29, v22 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_or_b32_e32 v16, v21, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v13, v9, v13 -; GFX11-NEXT: v_or_b32_e32 v14, v10, v14 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v15 -; GFX11-NEXT: v_or_b32_e32 v9, v5, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v66, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v67, v10, v12 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v20 +; GFX11-NEXT: v_or_b32_e32 v9, v5, v22 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v87 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v86 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v85 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v85 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v82 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v21 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v83 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v71 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 ; GFX11-NEXT: v_or_b32_e32 v10, v20, v18 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v12 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v49 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v49 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v51 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v35 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v31 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17 @@ -213480,99 +214208,101 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v21, 8, v27 ; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v23 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v18 -; GFX11-NEXT: v_or_b32_e32 v16, v20, v17 +; GFX11-NEXT: v_or_b32_e32 v12, v20, v17 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v21 ; GFX11-NEXT: v_or_b32_e32 v17, v22, v19 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v50 ; GFX11-NEXT: v_or_b32_e32 v10, v5, v6 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v11 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v18 -; GFX11-NEXT: v_or_b32_e32 v3, v19, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v19, v12 ; GFX11-NEXT: v_or_b32_e32 v4, v20, v17 ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_store_b128 v0, v[36:39], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:64 -; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 -; GFX11-NEXT: 
scratch_load_b32 v72, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:72 -; GFX11-NEXT: v_readlane_b32 s104, v76, 8 -; GFX11-NEXT: v_readlane_b32 s103, v76, 7 -; GFX11-NEXT: v_readlane_b32 s102, v76, 6 -; GFX11-NEXT: v_readlane_b32 s101, v76, 5 -; GFX11-NEXT: v_readlane_b32 s100, v76, 4 -; GFX11-NEXT: v_readlane_b32 s99, v76, 3 -; GFX11-NEXT: v_readlane_b32 s98, v76, 2 -; GFX11-NEXT: v_readlane_b32 s97, v76, 1 -; GFX11-NEXT: v_readlane_b32 s96, v76, 0 -; GFX11-NEXT: v_readlane_b32 s87, v75, 31 -; GFX11-NEXT: v_readlane_b32 s86, v75, 30 -; GFX11-NEXT: v_readlane_b32 s85, v75, 29 -; GFX11-NEXT: v_readlane_b32 s84, v75, 28 -; GFX11-NEXT: v_readlane_b32 s83, v75, 27 -; GFX11-NEXT: v_readlane_b32 s82, v75, 26 -; GFX11-NEXT: v_readlane_b32 s81, v75, 25 -; GFX11-NEXT: v_readlane_b32 s80, v75, 24 -; GFX11-NEXT: v_readlane_b32 s71, v75, 23 -; GFX11-NEXT: v_readlane_b32 s70, v75, 22 -; GFX11-NEXT: v_readlane_b32 s69, v75, 21 -; GFX11-NEXT: v_readlane_b32 s68, v75, 20 -; GFX11-NEXT: v_readlane_b32 s67, v75, 19 -; GFX11-NEXT: v_readlane_b32 s66, v75, 18 -; GFX11-NEXT: v_readlane_b32 s65, v75, 17 -; GFX11-NEXT: v_readlane_b32 s64, 
v75, 16 -; GFX11-NEXT: v_readlane_b32 s55, v75, 15 -; GFX11-NEXT: v_readlane_b32 s54, v75, 14 -; GFX11-NEXT: v_readlane_b32 s53, v75, 13 -; GFX11-NEXT: v_readlane_b32 s52, v75, 12 -; GFX11-NEXT: v_readlane_b32 s51, v75, 11 -; GFX11-NEXT: v_readlane_b32 s50, v75, 10 -; GFX11-NEXT: v_readlane_b32 s49, v75, 9 -; GFX11-NEXT: v_readlane_b32 s48, v75, 8 -; GFX11-NEXT: v_readlane_b32 s39, v75, 7 -; GFX11-NEXT: v_readlane_b32 s38, v75, 6 -; GFX11-NEXT: v_readlane_b32 s37, v75, 5 -; GFX11-NEXT: v_readlane_b32 s36, v75, 4 -; GFX11-NEXT: v_readlane_b32 s35, v75, 3 -; GFX11-NEXT: v_readlane_b32 s34, v75, 2 -; GFX11-NEXT: v_readlane_b32 s31, v75, 1 -; GFX11-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:68 +; GFX11-NEXT: v_readlane_b32 s104, v75, 8 +; GFX11-NEXT: v_readlane_b32 s103, v75, 7 +; GFX11-NEXT: v_readlane_b32 s102, v75, 6 +; GFX11-NEXT: v_readlane_b32 s101, v75, 5 +; GFX11-NEXT: v_readlane_b32 s100, v75, 4 +; GFX11-NEXT: v_readlane_b32 s99, v75, 3 +; GFX11-NEXT: v_readlane_b32 
s98, v75, 2 +; GFX11-NEXT: v_readlane_b32 s97, v75, 1 +; GFX11-NEXT: v_readlane_b32 s96, v75, 0 +; GFX11-NEXT: v_readlane_b32 s87, v74, 31 +; GFX11-NEXT: v_readlane_b32 s86, v74, 30 +; GFX11-NEXT: v_readlane_b32 s85, v74, 29 +; GFX11-NEXT: v_readlane_b32 s84, v74, 28 +; GFX11-NEXT: v_readlane_b32 s83, v74, 27 +; GFX11-NEXT: v_readlane_b32 s82, v74, 26 +; GFX11-NEXT: v_readlane_b32 s81, v74, 25 +; GFX11-NEXT: v_readlane_b32 s80, v74, 24 +; GFX11-NEXT: v_readlane_b32 s71, v74, 23 +; GFX11-NEXT: v_readlane_b32 s70, v74, 22 +; GFX11-NEXT: v_readlane_b32 s69, v74, 21 +; GFX11-NEXT: v_readlane_b32 s68, v74, 20 +; GFX11-NEXT: v_readlane_b32 s67, v74, 19 +; GFX11-NEXT: v_readlane_b32 s66, v74, 18 +; GFX11-NEXT: v_readlane_b32 s65, v74, 17 +; GFX11-NEXT: v_readlane_b32 s64, v74, 16 +; GFX11-NEXT: v_readlane_b32 s55, v74, 15 +; GFX11-NEXT: v_readlane_b32 s54, v74, 14 +; GFX11-NEXT: v_readlane_b32 s53, v74, 13 +; GFX11-NEXT: v_readlane_b32 s52, v74, 12 +; GFX11-NEXT: v_readlane_b32 s51, v74, 11 +; GFX11-NEXT: v_readlane_b32 s50, v74, 10 +; GFX11-NEXT: v_readlane_b32 s49, v74, 9 +; GFX11-NEXT: v_readlane_b32 s48, v74, 8 +; GFX11-NEXT: v_readlane_b32 s39, v74, 7 +; GFX11-NEXT: v_readlane_b32 s38, v74, 6 +; GFX11-NEXT: v_readlane_b32 s37, v74, 5 +; GFX11-NEXT: v_readlane_b32 s36, v74, 4 +; GFX11-NEXT: v_readlane_b32 s35, v74, 3 +; GFX11-NEXT: v_readlane_b32 s34, v74, 2 +; GFX11-NEXT: v_readlane_b32 s31, v74, 1 +; GFX11-NEXT: v_readlane_b32 s30, v74, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -218382,11 +219112,11 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 
x bfloat> inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v42, s30, 0 +; VI-NEXT: v_writelane_b32 v43, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_writelane_b32 v42, s31, 1 +; VI-NEXT: v_writelane_b32 v43, s31, 1 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 @@ -218406,8 +219136,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB101_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB101_4 @@ -218415,548 +219146,551 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; 
VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v15, v6, v5, 16 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, 
v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v6, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v5, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v5 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v4 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; 
VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v6, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v5, 16, 1 +; VI-NEXT: 
v_or_b32_e32 v2, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v5 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v4 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, 
v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v6, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 
0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v5, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v5 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v33, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v33, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v33 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v4 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v34, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v34, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v34 +; VI-NEXT: v_add_f32_e32 v33, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v33, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v33 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v35, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v35, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 
v18, v35 +; VI-NEXT: v_add_f32_e32 v34, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v34, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v34 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v33, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v33, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v33 +; VI-NEXT: v_add_f32_e32 v35, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v34, v35, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v34, v35 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, 
v18, v2 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v36, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc +; VI-NEXT: v_bfe_u32 v18, v36, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v36 +; VI-NEXT: v_add_f32_e32 v34, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v33, v34, 16, 1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v36 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v33, v34 +; VI-NEXT: v_add_f32_e32 v35, s4, v0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v37, v35, 16, 1 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v33, vcc +; VI-NEXT: v_add_f32_e32 v33, s4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v37, v35 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 ; 
VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; VI-NEXT: v_bfe_u32 v36, v33, 16, 1 ; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v35 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, 
v17 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, v36, v33 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_bfe_u32 v38, v34, 16, 1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v33 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, v38, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_bfe_u32 v39, v37, 16, 1 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v34 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; VI-NEXT: v_cndmask_b32_e32 v16, v16, v33, vcc +; VI-NEXT: v_add_u32_e32 v33, vcc, v39, v37 ; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_bfe_u32 
v36, v35, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v37 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v35 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v32 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v34, vcc +; VI-NEXT: v_add_u32_e32 v34, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 ; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v38 +; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc +; VI-NEXT: v_add_u32_e32 v35, vcc, v37, v36 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v32 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_cndmask_b32_e32 v32, v35, v37, vcc +; VI-NEXT: v_add_u32_e32 v35, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc +; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v37 ; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 ; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 ; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 +; VI-NEXT: v_bfe_u32 v37, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v19 ; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 ; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 ; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 ; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 +; VI-NEXT: v_bfe_u32 v38, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v20 ; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; 
VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 ; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 ; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 ; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 +; VI-NEXT: v_bfe_u32 v39, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v21 ; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v22 ; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 ; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 ; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 ; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 +; VI-NEXT: v_bfe_u32 v48, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v22 ; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: 
v_or_b32_e32 v49, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 ; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 ; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 ; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 +; VI-NEXT: v_bfe_u32 v49, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v23 ; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v24 ; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 ; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 ; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 +; VI-NEXT: 
v_bfe_u32 v50, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v24 ; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v25 ; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 ; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 ; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 ; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 +; VI-NEXT: v_bfe_u32 v51, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v25 ; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v26 ; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 ; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 ; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; VI-NEXT: 
v_cndmask_b32_e32 v51, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_bfe_u32 v52, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v26 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 ; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 ; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 ; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 ; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 +; VI-NEXT: v_bfe_u32 v53, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v27 ; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v28 ; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 ; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 ; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v53, v53 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_bfe_u32 v54, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v28 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v29 ; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 ; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 +; VI-NEXT: v_bfe_u32 v55, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v29 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v30 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 ; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: 
v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v31 +; VI-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; VI-NEXT: v_bfe_u32 v41, v40, 16, 1 +; VI-NEXT: v_add_u32_e32 v41, vcc, v41, v40 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x7fff, v41 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; VI-NEXT: v_bfe_u32 v41, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v41, vcc, v41, v31 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x7fff, v41 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_or_b32_e32 v42, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; VI-NEXT: v_alignbit_b32 v0, v18, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc +; VI-NEXT: v_alignbit_b32 v32, v18, v32, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 @@ -218970,25 +219704,22 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 
; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v40, 16 +; VI-NEXT: v_alignbit_b32 v30, v30, v55, 16 +; VI-NEXT: v_alignbit_b32 v29, v29, v54, 16 +; VI-NEXT: v_alignbit_b32 v28, v28, v53, 16 +; VI-NEXT: v_alignbit_b32 v27, v27, v52, 16 +; VI-NEXT: v_alignbit_b32 v26, v26, v51, 16 +; VI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; VI-NEXT: v_alignbit_b32 v24, v24, v49, 16 +; VI-NEXT: v_alignbit_b32 v23, v23, v48, 16 +; VI-NEXT: v_alignbit_b32 v22, v22, v39, 16 +; VI-NEXT: v_alignbit_b32 v21, v21, v38, 16 +; VI-NEXT: v_alignbit_b32 v20, v20, v37, 16 +; VI-NEXT: v_alignbit_b32 v19, v19, v36, 16 +; VI-NEXT: v_alignbit_b32 v16, v18, v16, 16 ; VI-NEXT: s_branch .LBB101_5 ; VI-NEXT: .LBB101_3: ; VI-NEXT: s_branch .LBB101_2 @@ -219010,13 +219741,14 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB101_5: ; %end -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 -; VI-NEXT: v_readlane_b32 s31, v42, 1 -; VI-NEXT: v_readlane_b32 s30, v42, 0 +; VI-NEXT: v_readlane_b32 s31, v43, 1 +; VI-NEXT: v_readlane_b32 s30, v43, 0 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -219025,11 +219757,11 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: v_mov_b32_e32 v31, v17 ; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 @@ -219049,9 +219781,10 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; GFX9-NEXT: v_readfirstlane_b32 s30, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_cbranch_execnz .LBB101_4 @@ -219059,612 +219792,612 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 ; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_add_f32_e32 v8, s4, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v15, v5, 16, v6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 
v2, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v6 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v6 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; 
GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 
0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v6 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: 
s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v6 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, 
v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v33, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v33 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v34, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, 
v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v33, v34 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v33, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v34, v33 +; GFX9-NEXT: v_bfe_u32 v35, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v35, v3 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v33, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v33, v33, v2 -; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v34, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v35, v34 +; GFX9-NEXT: v_bfe_u32 v36, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v36, v33 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v33, v33, v1 -; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v35, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc ; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; 
GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 ; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v36, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 ; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v36, vcc ; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 ; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v37, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v36, v37, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v33, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v33 -; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v33, 
0xffff0000, v16 -; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc -; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 -; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_add_f32_e32 v34, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v33 +; GFX9-NEXT: v_add_u32_e32 v33, v36, v37 ; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v35, v35, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v36, vcc +; GFX9-NEXT: v_add_u32_e32 v33, v35, v34 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v34 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; GFX9-NEXT: v_bfe_u32 v35, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v35, v35, v17 -; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v35, 
v36, vcc -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v35, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v17 +; GFX9-NEXT: v_add_u32_e32 v33, v39, v38 ; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v36, v36, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; GFX9-NEXT: v_add_u32_e32 v34, v37, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_u32_e32 v36, v38, v35 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc ; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 ; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; GFX9-NEXT: v_bfe_u32 v36, v32, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v36, v36, v32 -; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 -; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v36, v37, vcc -; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 -; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_add_u32_e32 v35, v39, v38 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v38, v38 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v37 ; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v37, v37, v36 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 ; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 -; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; GFX9-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v37, v37, v19 +; GFX9-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v32 ; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 -; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v37, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v19 ; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v38, v38, v37 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 ; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; GFX9-NEXT: v_bfe_u32 v38, v20, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v38, v38, v20 +; GFX9-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v19 ; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc -; 
GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v38, v39, vcc +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v39, v39, v38 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 ; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 -; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; GFX9-NEXT: v_bfe_u32 v39, v21, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v39, v39, v21 +; GFX9-NEXT: v_bfe_u32 v39, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v20 ; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 -; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc -; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v22 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v21 ; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 ; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v48, v48, v39 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 ; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; GFX9-NEXT: v_bfe_u32 v48, v22, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v48, v48, v22 +; GFX9-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v21 ; GFX9-NEXT: 
v_add_u32_e32 v48, 0x7fff, v48 -; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc -; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v48, v49, vcc +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 ; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 ; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v49, v49, v48 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 ; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; GFX9-NEXT: v_bfe_u32 v49, v23, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v49, v49, v23 +; GFX9-NEXT: v_bfe_u32 v49, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v22 ; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 -; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v24 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v49, v50, vcc +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v23 ; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v50, v50, v49 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 ; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; 
GFX9-NEXT: v_bfe_u32 v50, v24, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v50, v50, v24 +; GFX9-NEXT: v_bfe_u32 v50, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v23 ; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 -; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc -; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v50, v51, vcc +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v24 ; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 ; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v51, v51, v50 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 ; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; GFX9-NEXT: v_bfe_u32 v51, v25, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v51, v51, v25 +; GFX9-NEXT: v_bfe_u32 v51, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v24 ; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 -; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v51, v52, vcc +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 ; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 ; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v52, v52, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 ; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v51, v51 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; GFX9-NEXT: v_bfe_u32 v52, v26, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v52, v52, v26 +; GFX9-NEXT: v_bfe_u32 v52, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v25 ; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 -; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v27 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v52, v53, vcc +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v26 ; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 ; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v53, v53, v52 -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 ; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 -; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; GFX9-NEXT: v_bfe_u32 v53, v27, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v53, v53, v27 +; GFX9-NEXT: v_bfe_u32 v53, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v26 ; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 -; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v53, v54, vcc +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v27 ; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 ; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v54, v54, v53 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 ; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; GFX9-NEXT: v_bfe_u32 v54, v28, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v54, v54, v28 +; GFX9-NEXT: v_bfe_u32 v54, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v27 ; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 -; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v29 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v54, v55, vcc +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v28 ; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v55, v55, v54 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; GFX9-NEXT: v_bfe_u32 v55, v29, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v55, v55, v29 +; GFX9-NEXT: v_bfe_u32 v55, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v28 ; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 -; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v30 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v55, v40, vcc +; GFX9-NEXT: 
v_and_b32_e32 v55, 0xffff0000, v29 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v40, v40, v55 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; GFX9-NEXT: v_bfe_u32 v40, v30, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v40, v40, v30 +; GFX9-NEXT: v_bfe_u32 v40, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v29 ; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v31 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v40, v41, vcc +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v30 ; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 ; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v41, v41, v40 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 ; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc -; GFX9-NEXT: v_bfe_u32 v41, v31, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v41, v41, v31 +; GFX9-NEXT: v_bfe_u32 v41, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v30 ; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 -; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v31 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, 
v41, v42, vcc +; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v42, v42, v41 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v42, 0x7fff, v42 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; GFX9-NEXT: v_bfe_u32 v42, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v42, v42, v31 +; GFX9-NEXT: v_add_u32_e32 v42, 0x7fff, v42 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v42, v43, vcc +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v41 ; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v40 ; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v55 ; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v54 ; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v53, 16, v53 ; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v52 ; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v51 ; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v50 ; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v49 ; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v48 ; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v39 ; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v38 ; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v37 ; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v36 ; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX9-NEXT: 
v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v16, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 -; GFX9-NEXT: v_lshl_or_b32 v31, v40, 16, v31 -; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 -; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v29 -; GFX9-NEXT: v_lshl_or_b32 v28, v53, 16, v28 -; GFX9-NEXT: v_lshl_or_b32 v27, v52, 16, v27 -; GFX9-NEXT: v_lshl_or_b32 v26, v51, 16, v26 -; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 -; GFX9-NEXT: v_lshl_or_b32 v24, v49, 16, v24 -; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 -; GFX9-NEXT: v_lshl_or_b32 v22, v39, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v21, v38, 16, v21 -; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 -; GFX9-NEXT: v_lshl_or_b32 v19, v36, 16, v19 -; GFX9-NEXT: v_lshl_or_b32 v32, v35, 16, v32 -; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v31, v41, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v30, v40, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v29, v55, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v28, v54, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v27, v53, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v26, v52, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v25, v51, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v24, v50, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v23, v49, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v21, v39, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v20, v38, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v37, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v32, v36, 16, v32 ; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 ; GFX9-NEXT: s_branch .LBB101_5 ; GFX9-NEXT: .LBB101_3: @@ -219687,14 +220420,15 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; GFX9-NEXT: v_mov_b32_e32 v14, s30 ; GFX9-NEXT: 
v_mov_b32_e32 v15, s31 ; GFX9-NEXT: .LBB101_5: ; %end -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v18, v32 -; GFX9-NEXT: v_readlane_b32 s31, v43, 1 -; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-NEXT: v_readlane_b32 s30, v44, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -223124,8 +223858,8 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -223268,8 +224002,8 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: 
v_mov_b32_e32 v5, s21 @@ -227936,11 +228670,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v42, s30, 0 +; VI-NEXT: v_writelane_b32 v43, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_writelane_b32 v42, s31, 1 +; VI-NEXT: v_writelane_b32 v43, s31, 1 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 @@ -227960,8 +228694,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB105_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB105_4 @@ -227969,548 +228704,551 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; 
VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; 
VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v15, v6, v5, 16 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v6, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 
0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v5, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v5 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v4 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v6, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v4 +; VI-NEXT: 
v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v5, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v5 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v4 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, 
v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v7, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v6, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; 
VI-NEXT: v_add_u32_e32 v1, vcc, v18, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v5, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v5 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v33, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v33, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v33 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v4 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v34, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v34, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v34 +; VI-NEXT: v_add_f32_e32 v33, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v33, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v33 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v35, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v35, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 
v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v35 +; VI-NEXT: v_add_f32_e32 v34, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v3, v34, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v34 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v33, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v18, v33, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v33 +; VI-NEXT: v_add_f32_e32 v35, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v34, v35, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v34, v35 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, 
v2, v18, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v36, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc +; VI-NEXT: v_bfe_u32 v18, v36, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v36 +; VI-NEXT: v_add_f32_e32 v34, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_bfe_u32 v33, v34, 16, 1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v36 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v33, v34 +; VI-NEXT: v_add_f32_e32 v35, s4, v0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v37, v35, 16, 1 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v33, vcc +; 
VI-NEXT: v_add_f32_e32 v33, s4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v37, v35 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; VI-NEXT: v_bfe_u32 v36, v33, 16, 1 ; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v35 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, v36, v33 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_bfe_u32 v38, v34, 16, 1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v33 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, v38, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_bfe_u32 v39, v37, 16, 1 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v34 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; VI-NEXT: v_cndmask_b32_e32 v16, v16, v33, vcc +; VI-NEXT: v_add_u32_e32 v33, vcc, 
v39, v37 ; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v37 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v35 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v32 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v34, vcc +; VI-NEXT: v_add_u32_e32 v34, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 ; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v38 +; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc +; VI-NEXT: v_add_u32_e32 v35, vcc, v37, v36 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v32 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_cndmask_b32_e32 v32, v35, v37, vcc +; VI-NEXT: v_add_u32_e32 v35, vcc, v39, v38 +; VI-NEXT: 
v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc +; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v37 ; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 ; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 ; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 +; VI-NEXT: v_bfe_u32 v37, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v19 ; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 ; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 ; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 ; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 +; VI-NEXT: v_bfe_u32 v38, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v20 ; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: 
v_or_b32_e32 v39, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 ; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 ; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 ; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 +; VI-NEXT: v_bfe_u32 v39, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v21 ; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v22 ; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 ; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 ; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 ; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 +; VI-NEXT: 
v_bfe_u32 v48, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v22 ; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 ; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 ; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 ; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 +; VI-NEXT: v_bfe_u32 v49, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v23 ; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v24 ; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 ; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 ; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; VI-NEXT: 
v_cndmask_b32_e32 v49, v50, v51, vcc -; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 +; VI-NEXT: v_bfe_u32 v50, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v24 ; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v25 ; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 ; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 ; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 ; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 +; VI-NEXT: v_bfe_u32 v51, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v25 ; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v26 ; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 ; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 ; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v51, v51 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_bfe_u32 v52, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v26 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 ; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 ; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 ; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 ; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 +; VI-NEXT: v_bfe_u32 v53, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v27 ; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v28 ; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 ; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 ; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: 
v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_bfe_u32 v54, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v28 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v29 ; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 ; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 +; VI-NEXT: v_bfe_u32 v55, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v29 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v30 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; VI-NEXT: 
v_bfe_u32 v40, v55, 16, 1 ; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v31 +; VI-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; VI-NEXT: v_bfe_u32 v41, v40, 16, 1 +; VI-NEXT: v_add_u32_e32 v41, vcc, v41, v40 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x7fff, v41 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; VI-NEXT: v_bfe_u32 v41, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v41, vcc, v41, v31 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x7fff, v41 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_or_b32_e32 v42, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; VI-NEXT: v_alignbit_b32 v0, v18, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc +; VI-NEXT: v_alignbit_b32 v32, v18, v32, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 @@ -228524,25 +229262,22 @@ define inreg <64 x i16> 
@bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v40, 16 +; VI-NEXT: v_alignbit_b32 v30, v30, v55, 16 +; VI-NEXT: v_alignbit_b32 v29, v29, v54, 16 +; VI-NEXT: v_alignbit_b32 v28, v28, v53, 16 +; VI-NEXT: v_alignbit_b32 v27, v27, v52, 16 +; VI-NEXT: v_alignbit_b32 v26, v26, v51, 16 +; VI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; VI-NEXT: v_alignbit_b32 v24, v24, v49, 16 +; VI-NEXT: v_alignbit_b32 v23, v23, v48, 16 +; VI-NEXT: v_alignbit_b32 v22, v22, v39, 16 +; VI-NEXT: v_alignbit_b32 v21, v21, v38, 16 +; VI-NEXT: v_alignbit_b32 v20, v20, v37, 16 +; VI-NEXT: v_alignbit_b32 v19, v19, v36, 16 +; VI-NEXT: v_alignbit_b32 v16, v18, v16, 16 ; VI-NEXT: s_branch .LBB105_5 ; VI-NEXT: .LBB105_3: ; VI-NEXT: s_branch .LBB105_2 @@ -228564,13 +229299,14 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 
v15, s31 ; VI-NEXT: .LBB105_5: ; %end -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 -; VI-NEXT: v_readlane_b32 s31, v42, 1 -; VI-NEXT: v_readlane_b32 s30, v42, 0 +; VI-NEXT: v_readlane_b32 s31, v43, 1 +; VI-NEXT: v_readlane_b32 s30, v43, 0 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -228579,11 +229315,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: v_mov_b32_e32 v31, v17 ; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 @@ -228603,9 +229339,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_readfirstlane_b32 s30, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_cbranch_execnz .LBB105_4 @@ -228613,549 +229350,550 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 ; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, 
s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff0000 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_and_or_b32 v14, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_add_f32_e32 v8, s4, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_and_or_b32 v15, v3, v18, v4 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_or_b32 v15, v5, v18, v6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 
+; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v14, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v6 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v13, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v13, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, 
vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v12, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v12, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v6 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v11, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; 
GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v11, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v10, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v10, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v6 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v9, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v9, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v8, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v8, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v6 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v7, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v7, v1, v18, v2 
+; GFX9-NEXT: v_add_u32_e32 v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v6, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v5, v33, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v6, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v5, v33 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v5, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, 
v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v34, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v5, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v33, v34 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 ; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v4, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v33, vcc +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v4, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v34, v33 +; GFX9-NEXT: v_bfe_u32 v35, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v35, v3 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v3, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v33, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v33, v33, v2 -; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v34, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_and_or_b32 v3, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v35, v34 +; GFX9-NEXT: v_bfe_u32 v36, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v36, v33 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v2, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v33, 
v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v33, v33, v1 -; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v35, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc ; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 ; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_and_or_b32 v2, v1, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v36, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 ; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v36, vcc ; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 ; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v37, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v36, v37, 16, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v1, v1, v18, v33 -; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v33, v18, v0 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 -; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc -; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 -; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_add_f32_e32 v34, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v1, v18, v33 +; GFX9-NEXT: v_add_u32_e32 v33, v36, v37 ; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v35, v35, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v36, vcc +; GFX9-NEXT: v_add_u32_e32 v33, v35, v34 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v34 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; GFX9-NEXT: v_bfe_u32 v35, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v35, v35, v17 -; 
GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v35, v36, vcc -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v35, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v17 +; GFX9-NEXT: v_add_u32_e32 v33, v39, v38 ; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v36, v36, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; GFX9-NEXT: v_add_u32_e32 v34, v37, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_u32_e32 v36, v38, v35 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc ; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 ; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; GFX9-NEXT: v_bfe_u32 v36, v32, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v36, v36, v32 -; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 -; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v36, v37, vcc -; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 -; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_add_u32_e32 v35, v39, v38 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc 
+; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v37 ; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v37, v37, v36 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 ; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 -; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; GFX9-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v37, v37, v19 +; GFX9-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v32 ; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 -; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v37, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v19 ; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v38, v38, v37 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 ; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; GFX9-NEXT: v_bfe_u32 v38, v20, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v38, v38, v20 +; GFX9-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v19 ; GFX9-NEXT: 
v_add_u32_e32 v38, 0x7fff, v38 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc -; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v38, v39, vcc +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v39, v39, v38 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 ; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 -; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; GFX9-NEXT: v_bfe_u32 v39, v21, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v39, v39, v21 +; GFX9-NEXT: v_bfe_u32 v39, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v20 ; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 -; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc -; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v22 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v21 ; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 ; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v48, v48, v39 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 ; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; 
GFX9-NEXT: v_bfe_u32 v48, v22, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v48, v48, v22 +; GFX9-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v21 ; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 -; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc -; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v48, v49, vcc +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 ; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 ; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v49, v49, v48 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 ; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; GFX9-NEXT: v_bfe_u32 v49, v23, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v49, v49, v23 +; GFX9-NEXT: v_bfe_u32 v49, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v22 ; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 -; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v24 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v49, v50, vcc +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v23 ; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v50, v50, v49 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 ; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v49, v49 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; GFX9-NEXT: v_bfe_u32 v50, v24, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v50, v50, v24 +; GFX9-NEXT: v_bfe_u32 v50, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v23 ; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 -; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc -; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v50, v51, vcc +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v24 ; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 ; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v51, v51, v50 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 ; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; GFX9-NEXT: v_bfe_u32 v51, v25, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v51, v51, v25 +; GFX9-NEXT: v_bfe_u32 v51, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v24 ; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 -; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v51, v52, vcc +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 ; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 ; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v52, v52, v51 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 ; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; GFX9-NEXT: v_bfe_u32 v52, v26, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v52, v52, v26 +; GFX9-NEXT: v_bfe_u32 v52, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v25 ; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 -; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v27 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v52, v53, vcc +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v26 ; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 ; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v53, v53, v52 -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 ; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 -; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; GFX9-NEXT: v_bfe_u32 v53, v27, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v53, v53, v27 +; GFX9-NEXT: v_bfe_u32 v53, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v26 ; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 -; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v53, v54, vcc +; GFX9-NEXT: 
v_and_b32_e32 v53, 0xffff0000, v27 ; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 ; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v54, v54, v53 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 ; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; GFX9-NEXT: v_bfe_u32 v54, v28, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v54, v54, v28 +; GFX9-NEXT: v_bfe_u32 v54, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v27 ; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 -; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v29 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v54, v55, vcc +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v28 ; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v55, v55, v54 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; GFX9-NEXT: v_bfe_u32 v55, v29, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v55, v55, v29 +; GFX9-NEXT: v_bfe_u32 v55, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v28 ; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 -; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc -; GFX9-NEXT: v_and_b32_e32 
v55, 0xffff0000, v30 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v55, v40, vcc +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v29 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v40, v40, v55 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; GFX9-NEXT: v_bfe_u32 v40, v30, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v40, v40, v30 +; GFX9-NEXT: v_bfe_u32 v40, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v29 ; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v31 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v40, v41, vcc +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v30 ; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 ; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v41, v41, v40 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 ; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc -; GFX9-NEXT: v_bfe_u32 v41, v31, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v41, v41, v31 +; GFX9-NEXT: v_bfe_u32 v41, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v30 ; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, 
v41 -; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v31 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v41, v42, vcc +; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v42, v42, v41 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v42, 0x7fff, v42 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; GFX9-NEXT: v_bfe_u32 v42, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v42, v42, v31 +; GFX9-NEXT: v_add_u32_e32 v42, 0x7fff, v42 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v42, v43, vcc +; GFX9-NEXT: v_and_or_b32 v0, v0, v18, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 @@ -229170,23 +229908,22 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_and_or_b32 v31, v40, v18, v31 -; GFX9-NEXT: v_and_or_b32 v30, v55, v18, v30 -; GFX9-NEXT: v_and_or_b32 v29, v54, v18, v29 -; GFX9-NEXT: v_and_or_b32 v28, v53, v18, v28 -; GFX9-NEXT: v_and_or_b32 v27, v52, v18, v27 -; GFX9-NEXT: v_and_or_b32 v26, v51, v18, v26 -; GFX9-NEXT: v_and_or_b32 v25, v50, v18, v25 -; GFX9-NEXT: v_and_or_b32 v24, v49, v18, v24 -; GFX9-NEXT: v_and_or_b32 v23, v48, v18, v23 -; GFX9-NEXT: 
v_and_or_b32 v22, v39, v18, v22 -; GFX9-NEXT: v_and_or_b32 v21, v38, v18, v21 -; GFX9-NEXT: v_and_or_b32 v20, v37, v18, v20 -; GFX9-NEXT: v_and_or_b32 v19, v36, v18, v19 -; GFX9-NEXT: v_and_or_b32 v32, v35, v18, v32 -; GFX9-NEXT: v_and_or_b32 v17, v34, v18, v17 +; GFX9-NEXT: v_and_or_b32 v17, v17, v18, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v34 +; GFX9-NEXT: v_and_or_b32 v31, v41, v18, v31 +; GFX9-NEXT: v_and_or_b32 v30, v40, v18, v30 +; GFX9-NEXT: v_and_or_b32 v29, v55, v18, v29 +; GFX9-NEXT: v_and_or_b32 v28, v54, v18, v28 +; GFX9-NEXT: v_and_or_b32 v27, v53, v18, v27 +; GFX9-NEXT: v_and_or_b32 v26, v52, v18, v26 +; GFX9-NEXT: v_and_or_b32 v25, v51, v18, v25 +; GFX9-NEXT: v_and_or_b32 v24, v50, v18, v24 +; GFX9-NEXT: v_and_or_b32 v23, v49, v18, v23 +; GFX9-NEXT: v_and_or_b32 v22, v48, v18, v22 +; GFX9-NEXT: v_and_or_b32 v21, v39, v18, v21 +; GFX9-NEXT: v_and_or_b32 v20, v38, v18, v20 +; GFX9-NEXT: v_and_or_b32 v19, v37, v18, v19 +; GFX9-NEXT: v_and_or_b32 v32, v36, v18, v32 ; GFX9-NEXT: v_and_or_b32 v16, v33, v18, v16 ; GFX9-NEXT: s_branch .LBB105_5 ; GFX9-NEXT: .LBB105_3: @@ -229209,14 +229946,15 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, s30 ; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: .LBB105_5: ; %end -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v18, v32 -; GFX9-NEXT: v_readlane_b32 s31, v43, 1 -; GFX9-NEXT: 
v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-NEXT: v_readlane_b32 s30, v44, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -230975,6 +231713,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v40, s30, 0 ; SI-NEXT: v_writelane_b32 v40, s31, 1 @@ -231008,235 +231753,225 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: s_mov_b32 s60, s17 ; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s60, s16 +; SI-NEXT: s_mov_b32 s61, s16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v41, s17, 0 -; SI-NEXT: s_mov_b32 s61, s19 -; SI-NEXT: v_writelane_b32 v41, s60, 1 -; SI-NEXT: s_mov_b32 s63, s18 -; SI-NEXT: v_writelane_b32 v41, s61, 2 +; SI-NEXT: v_writelane_b32 v41, s60, 0 +; SI-NEXT: s_mov_b32 s62, s19 +; SI-NEXT: v_writelane_b32 
v41, s61, 1 +; SI-NEXT: v_writelane_b32 v41, s62, 2 ; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: v_writelane_b32 v41, s63, 3 +; SI-NEXT: v_writelane_b32 v41, s18, 3 ; SI-NEXT: v_writelane_b32 v41, s72, 4 ; SI-NEXT: s_mov_b32 s74, s23 ; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_writelane_b32 v41, s74, 6 -; SI-NEXT: s_mov_b32 s75, s25 +; SI-NEXT: s_mov_b32 s76, s25 ; SI-NEXT: v_writelane_b32 v41, s22, 7 -; SI-NEXT: v_writelane_b32 v41, s75, 8 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s76, 8 +; SI-NEXT: s_mov_b32 s79, s27 ; SI-NEXT: v_writelane_b32 v41, s24, 9 -; SI-NEXT: v_writelane_b32 v41, s76, 10 +; SI-NEXT: v_writelane_b32 v41, s79, 10 ; SI-NEXT: s_mov_b32 s93, s29 ; SI-NEXT: v_writelane_b32 v41, s26, 11 ; SI-NEXT: v_writelane_b32 v41, s93, 12 -; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_readfirstlane_b32 s63, v2 ; SI-NEXT: v_writelane_b32 v41, s28, 13 -; SI-NEXT: v_readfirstlane_b32 s73, v4 -; SI-NEXT: v_writelane_b32 v41, s16, 14 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_writelane_b32 v41, s73, 15 -; SI-NEXT: v_readfirstlane_b32 s90, v6 -; SI-NEXT: v_writelane_b32 v41, s89, 16 -; SI-NEXT: v_readfirstlane_b32 s91, v5 -; SI-NEXT: v_writelane_b32 v41, s90, 17 +; SI-NEXT: v_readfirstlane_b32 s89, v4 +; SI-NEXT: v_writelane_b32 v41, s63, 14 +; SI-NEXT: v_readfirstlane_b32 s90, v3 +; SI-NEXT: v_writelane_b32 v41, s89, 15 +; SI-NEXT: v_readfirstlane_b32 s91, v6 +; SI-NEXT: v_writelane_b32 v41, s90, 16 +; SI-NEXT: v_readfirstlane_b32 s31, v5 +; SI-NEXT: v_writelane_b32 v41, s91, 17 ; SI-NEXT: v_readfirstlane_b32 s34, v8 -; SI-NEXT: v_writelane_b32 v41, s91, 18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s71, v32 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s7, v33 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s84, v34 +; SI-NEXT: 
s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s87, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v41, s31, 18 ; SI-NEXT: v_readfirstlane_b32 s35, v7 ; SI-NEXT: v_writelane_b32 v41, s34, 19 ; SI-NEXT: v_readfirstlane_b32 s36, v10 ; SI-NEXT: v_writelane_b32 v41, s35, 20 -; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: v_writelane_b32 v40, s97, 33 ; SI-NEXT: v_readfirstlane_b32 s37, v9 ; SI-NEXT: v_writelane_b32 v41, s36, 21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s62, v31 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s80, v32 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v33 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s84, v34 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s68, v35 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s87, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s6, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; 
SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s38, v12 -; SI-NEXT: v_writelane_b32 v41, s37, 22 ; SI-NEXT: v_writelane_b32 v40, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s14, v30 -; SI-NEXT: v_readfirstlane_b32 s15, v29 -; SI-NEXT: v_readfirstlane_b32 s12, v28 -; SI-NEXT: v_readfirstlane_b32 s13, v27 -; SI-NEXT: v_readfirstlane_b32 s10, v26 -; SI-NEXT: v_readfirstlane_b32 s11, v25 -; SI-NEXT: v_readfirstlane_b32 s8, v24 -; SI-NEXT: v_readfirstlane_b32 s9, v23 -; SI-NEXT: v_readfirstlane_b32 s88, v22 -; SI-NEXT: v_readfirstlane_b32 s29, v21 -; SI-NEXT: v_readfirstlane_b32 s79, v20 -; SI-NEXT: v_readfirstlane_b32 s27, v19 -; SI-NEXT: v_readfirstlane_b32 s78, v18 -; SI-NEXT: v_readfirstlane_b32 s25, v17 -; SI-NEXT: v_readfirstlane_b32 s77, v16 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s39, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v13 -; SI-NEXT: v_readfirstlane_b32 s19, v11 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_writelane_b32 v41, s38, 23 -; SI-NEXT: v_writelane_b32 v40, s99, 35 -; SI-NEXT: v_writelane_b32 v41, s39, 24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s58, v37 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s59, v38 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: v_readfirstlane_b32 s56, v39 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s59, v32 +; SI-NEXT: v_readfirstlane_b32 s57, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s56, v33 +; SI-NEXT: v_readfirstlane_b32 s46, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s57, v39 +; SI-NEXT: v_readfirstlane_b32 s47, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s44, v51 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s47, v49 +; 
SI-NEXT: v_readfirstlane_b32 s45, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s44, v50 +; SI-NEXT: v_readfirstlane_b32 s42, v53 +; SI-NEXT: v_readfirstlane_b32 s12, v30 +; SI-NEXT: v_readfirstlane_b32 s13, v29 +; SI-NEXT: v_readfirstlane_b32 s10, v28 +; SI-NEXT: v_readfirstlane_b32 s11, v27 +; SI-NEXT: v_readfirstlane_b32 s8, v26 +; SI-NEXT: v_readfirstlane_b32 s9, v25 +; SI-NEXT: v_readfirstlane_b32 s88, v24 +; SI-NEXT: v_readfirstlane_b32 s29, v23 +; SI-NEXT: v_readfirstlane_b32 s78, v22 +; SI-NEXT: v_readfirstlane_b32 s27, v21 +; SI-NEXT: v_readfirstlane_b32 s77, v20 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_readfirstlane_b32 s75, v18 +; SI-NEXT: v_readfirstlane_b32 s23, v17 +; SI-NEXT: v_readfirstlane_b32 s73, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_readfirstlane_b32 s19, v13 +; SI-NEXT: v_readfirstlane_b32 s38, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: v_writelane_b32 v41, s37, 22 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_writelane_b32 v41, s38, 23 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: v_readfirstlane_b32 s43, v31 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s42, v34 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: v_readfirstlane_b32 s40, v32 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s43, v35 +; SI-NEXT: v_readfirstlane_b32 s41, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s40, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s41, v37 +; SI-NEXT: v_readfirstlane_b32 s14, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s15, v35 ; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s4, s60, 16 +; SI-NEXT: 
s_lshl_b32 s4, s61, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 24 +; SI-NEXT: s_lshl_b32 s4, s18, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 25 -; SI-NEXT: s_lshl_b32 s4, s63, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 26 ; SI-NEXT: s_lshl_b32 s4, s20, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 27 +; SI-NEXT: v_writelane_b32 v41, s4, 26 ; SI-NEXT: s_lshl_b32 s4, s22, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 28 +; SI-NEXT: v_writelane_b32 v41, s4, 27 ; SI-NEXT: s_lshl_b32 s4, s24, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 29 +; SI-NEXT: v_writelane_b32 v41, s4, 28 ; SI-NEXT: s_lshl_b32 s4, s26, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 30 +; SI-NEXT: v_writelane_b32 v41, s4, 29 ; SI-NEXT: s_lshl_b32 s4, s28, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 30 +; SI-NEXT: s_lshl_b32 s4, s16, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 31 -; SI-NEXT: s_lshl_b32 s4, s18, 16 +; SI-NEXT: s_lshl_b32 s4, s90, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 32 -; SI-NEXT: s_lshl_b32 s4, s89, 16 +; SI-NEXT: s_lshl_b32 s4, s31, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 33 -; SI-NEXT: s_lshl_b32 s4, s91, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 34 ; SI-NEXT: s_lshl_b32 s4, s35, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 35 +; SI-NEXT: v_writelane_b32 v41, s4, 34 ; SI-NEXT: s_lshl_b32 s4, s37, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s96, s61, 16 -; SI-NEXT: s_lshl_b32 s99, s72, 16 -; SI-NEXT: s_lshl_b32 s97, s74, 16 -; SI-NEXT: s_lshl_b32 s92, s75, 16 -; SI-NEXT: s_lshl_b32 s94, s76, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 35 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: s_lshl_b32 s99, s60, 16 +; SI-NEXT: s_lshl_b32 s85, s62, 16 +; SI-NEXT: s_lshl_b32 s97, s72, 16 +; SI-NEXT: s_lshl_b32 s86, s74, 16 +; SI-NEXT: s_lshl_b32 s92, s76, 16 +; SI-NEXT: s_lshl_b32 s94, s79, 16 ; SI-NEXT: s_lshl_b32 s95, s93, 16 -; SI-NEXT: s_lshl_b32 s93, s16, 16 -; SI-NEXT: s_lshl_b32 s30, s73, 16 -; SI-NEXT: s_lshl_b32 s31, s90, 16 +; SI-NEXT: s_lshl_b32 s93, s63, 16 +; SI-NEXT: s_lshl_b32 s30, s89, 16 +; SI-NEXT: 
s_lshl_b32 s31, s91, 16 ; SI-NEXT: s_lshl_b32 s34, s34, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 36 ; SI-NEXT: s_lshl_b32 s35, s36, 16 -; SI-NEXT: s_lshl_b32 s86, s19, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 36 ; SI-NEXT: s_lshl_b32 s36, s38, 16 +; SI-NEXT: s_lshl_b32 s83, s19, 16 +; SI-NEXT: s_lshl_b32 s37, s6, 16 ; SI-NEXT: s_lshl_b32 s22, s21, 16 -; SI-NEXT: s_lshl_b32 s37, s39, 16 +; SI-NEXT: s_lshl_b32 s38, s73, 16 ; SI-NEXT: s_lshl_b32 s24, s23, 16 -; SI-NEXT: s_lshl_b32 s38, s77, 16 -; SI-NEXT: s_lshl_b32 s28, s25, 16 -; SI-NEXT: s_lshl_b32 s39, s78, 16 -; SI-NEXT: s_lshl_b32 s61, s27, 16 -; SI-NEXT: s_lshl_b32 s48, s79, 16 -; SI-NEXT: s_lshl_b32 s89, s29, 16 -; SI-NEXT: s_lshl_b32 s49, s88, 16 -; SI-NEXT: s_lshl_b32 s60, s9, 16 -; SI-NEXT: s_lshl_b32 s50, s8, 16 +; SI-NEXT: s_lshl_b32 s39, s75, 16 +; SI-NEXT: s_lshl_b32 s26, s25, 16 +; SI-NEXT: s_lshl_b32 s48, s77, 16 +; SI-NEXT: s_lshl_b32 s79, s27, 16 +; SI-NEXT: s_lshl_b32 s49, s78, 16 +; SI-NEXT: s_lshl_b32 s61, s29, 16 +; SI-NEXT: s_lshl_b32 s50, s88, 16 +; SI-NEXT: s_lshl_b32 s89, s9, 16 +; SI-NEXT: s_lshl_b32 s91, s8, 16 ; SI-NEXT: s_lshl_b32 s90, s11, 16 -; SI-NEXT: s_lshl_b32 s91, s10, 16 -; SI-NEXT: s_lshl_b32 s70, s13, 16 -; SI-NEXT: s_lshl_b32 s51, s12, 16 -; SI-NEXT: s_lshl_b32 s71, s15, 16 -; SI-NEXT: s_lshl_b32 s52, s14, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 16 -; SI-NEXT: s_lshl_b32 s53, s40, 16 -; SI-NEXT: s_lshl_b32 s81, s43, 16 -; SI-NEXT: s_lshl_b32 s54, s42, 16 -; SI-NEXT: s_lshl_b32 s63, s45, 16 -; SI-NEXT: s_lshl_b32 s55, s44, 16 -; SI-NEXT: s_lshl_b32 s72, s47, 16 -; SI-NEXT: s_lshl_b32 s64, s46, 16 -; SI-NEXT: s_lshl_b32 s82, s57, 16 -; SI-NEXT: s_lshl_b32 s65, s56, 16 -; SI-NEXT: s_lshl_b32 s74, s59, 16 -; SI-NEXT: s_lshl_b32 s66, s58, 16 -; SI-NEXT: s_lshl_b32 s75, s87, 16 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: s_lshl_b32 s67, s6, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 16 -; SI-NEXT: s_mov_b32 s16, s68 +; SI-NEXT: s_lshl_b32 s51, s10, 16 +; SI-NEXT: s_lshl_b32 s20, s13, 16 +; 
SI-NEXT: s_lshl_b32 s52, s12, 16 +; SI-NEXT: s_lshl_b32 s28, s15, 16 +; SI-NEXT: s_lshl_b32 s53, s14, 16 +; SI-NEXT: s_lshl_b32 s62, s41, 16 +; SI-NEXT: s_lshl_b32 s54, s40, 16 +; SI-NEXT: s_lshl_b32 s70, s43, 16 +; SI-NEXT: s_lshl_b32 s55, s42, 16 +; SI-NEXT: s_lshl_b32 s72, s45, 16 +; SI-NEXT: s_lshl_b32 s64, s44, 16 +; SI-NEXT: s_lshl_b32 s80, s47, 16 +; SI-NEXT: s_lshl_b32 s65, s46, 16 +; SI-NEXT: s_lshl_b32 s74, s57, 16 +; SI-NEXT: s_lshl_b32 s66, s56, 16 +; SI-NEXT: s_lshl_b32 s81, s59, 16 +; SI-NEXT: s_lshl_b32 s67, s58, 16 +; SI-NEXT: s_lshl_b32 s76, s87, 16 +; SI-NEXT: s_mov_b32 s63, s68 ; SI-NEXT: s_lshl_b32 s68, s68, 16 -; SI-NEXT: s_lshl_b32 s85, s84, 16 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: s_lshl_b32 s69, s69, 16 -; SI-NEXT: s_lshl_b32 s17, s80, 16 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: s_lshl_b32 s26, s62, 16 +; SI-NEXT: s_lshl_b32 s82, s84, 16 +; SI-NEXT: s_mov_b32 s98, s7 +; SI-NEXT: s_lshl_b32 s69, s7, 16 +; SI-NEXT: s_lshl_b32 s18, s71, 16 +; SI-NEXT: s_mov_b32 s7, s96 +; SI-NEXT: s_lshl_b32 s60, s96, 16 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s16, s68 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: s_mov_b32 s63, s68 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: s_mov_b32 s98, s7 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: s_mov_b32 s7, s96 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 ; SI-NEXT: 
s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr95 @@ -231245,107 +231980,97 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr83 ; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; 
implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; kill: killed $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_mov_b32 s5, s17 -; SI-NEXT: s_mov_b32 s17, s86 -; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_mov_b32 s5, s18 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: 
s_lshl_b32 s5, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 24 -; SI-NEXT: s_lshl_b32 s20, s6, 16 +; SI-NEXT: s_lshl_b32 s18, s6, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: v_readlane_b32 s6, v41, 23 -; SI-NEXT: s_lshl_b32 s17, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 22 -; SI-NEXT: s_lshl_b32 s61, s16, 16 -; SI-NEXT: s_add_i32 s16, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 21 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_lshl_b32 s7, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 20 -; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 +; SI-NEXT: s_or_b32 s7, s7, s17 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v41, 19 +; SI-NEXT: v_readlane_b32 s17, v41, 21 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_readlane_b32 s16, v41, 18 +; SI-NEXT: s_or_b32 s6, s17, s6 +; SI-NEXT: v_readlane_b32 s17, v41, 20 ; SI-NEXT: s_lshl_b32 s60, s98, 16 -; SI-NEXT: s_or_b32 s17, s17, s19 -; SI-NEXT: s_add_i32 s98, s16, 3 -; SI-NEXT: v_readlane_b32 s19, v41, 17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s19, s16 -; SI-NEXT: v_readlane_b32 s19, v41, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_add_i32 s98, s17, 3 +; SI-NEXT: v_readlane_b32 s19, v41, 19 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: s_add_i32 s96, s19, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 15 +; SI-NEXT: s_add_i32 s21, s21, 
3 +; SI-NEXT: s_and_b32 s17, s98, 0xffff +; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_lshl_b32 s12, s12, 16 @@ -231354,108 +232079,122 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b32 s9, s29, 0xffff ; SI-NEXT: s_lshl_b32 s11, s88, 16 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_and_b32 s19, s96, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s20, s73, 16 +; SI-NEXT: s_or_b32 s17, s19, s17 +; SI-NEXT: v_readlane_b32 s19, v41, 18 ; SI-NEXT: s_and_b32 s15, s15, 0xffff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s9, s11, s9 ; SI-NEXT: s_and_b32 s11, s27, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: s_lshl_b32 s13, s78, 16 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s19, s21, s19 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_add_i32 s96, s19, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 17 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: s_or_b32 s11, s13, s11 ; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s78, 16 +; SI-NEXT: s_lshl_b32 s15, s77, 16 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_and_b32 s19, s96, 0xffff ; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s22, s77, 16 -; SI-NEXT: s_or_b32 s18, s21, s18 -; SI-NEXT: v_readlane_b32 s21, v41, 13 +; SI-NEXT: s_lshl_b32 s22, s75, 16 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: v_readlane_b32 s21, v41, 16 ; SI-NEXT: s_or_b32 s15, s22, s15 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_readlane_b32 s22, v41, 12 -; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_add_i32 s85, s21, 3 +; SI-NEXT: v_readlane_b32 s22, v41, 15 +; SI-NEXT: s_and_b32 s21, s85, 
0xffff ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_or_b32 s21, s22, s21 -; SI-NEXT: v_readlane_b32 s22, v41, 11 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s22, v41, 14 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s16, s22, s16 +; SI-NEXT: v_readlane_b32 s22, v41, 13 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_readlane_b32 s23, v41, 10 +; SI-NEXT: v_readlane_b32 s23, v41, 12 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_readlane_b32 s23, v41, 9 +; SI-NEXT: v_readlane_b32 s23, v41, 11 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_readlane_b32 s24, v41, 8 +; SI-NEXT: v_readlane_b32 s24, v41, 10 ; SI-NEXT: s_and_b32 s23, s23, 0xffff ; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_or_b32 s23, s24, s23 -; SI-NEXT: v_readlane_b32 s24, v41, 7 +; SI-NEXT: v_readlane_b32 s24, v41, 9 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v41, 6 +; SI-NEXT: v_readlane_b32 s25, v41, 8 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v41, 5 +; SI-NEXT: v_readlane_b32 s25, v41, 7 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_readlane_b32 s26, v41, 4 +; SI-NEXT: v_readlane_b32 s26, v41, 6 ; SI-NEXT: s_and_b32 s25, s25, 0xffff ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: s_or_b32 s25, s26, s25 -; SI-NEXT: v_readlane_b32 s26, v41, 3 +; SI-NEXT: v_readlane_b32 s26, v41, 5 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_readlane_b32 s27, v41, 2 +; SI-NEXT: v_readlane_b32 s27, v41, 4 ; SI-NEXT: s_and_b32 s26, s26, 0xffff ; SI-NEXT: s_lshl_b32 s27, s27, 16 ; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: v_readlane_b32 s27, v41, 1 +; SI-NEXT: v_readlane_b32 s27, v41, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: v_readlane_b32 s28, v41, 0 +; SI-NEXT: v_readlane_b32 s28, v41, 2 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_lshl_b32 s28, 
s28, 16 ; SI-NEXT: s_or_b32 s27, s28, s27 +; SI-NEXT: v_readlane_b32 s28, v41, 1 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_readlane_b32 s29, v41, 0 +; SI-NEXT: s_and_b32 s28, s28, 0xffff +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: s_add_i32 s28, s28, 0x30000 ; SI-NEXT: s_add_i32 s27, s27, 0x30000 +; SI-NEXT: s_and_b32 s99, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_add_i32 s26, s26, 0x30000 -; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 +; SI-NEXT: v_writelane_b32 v41, s28, 24 +; SI-NEXT: s_and_b32 s85, s27, 0xffff0000 ; SI-NEXT: s_lshl_b32 s27, s27, 16 ; SI-NEXT: s_add_i32 s25, s25, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s27, 25 -; SI-NEXT: s_and_b32 s96, s26, 0xffff0000 +; SI-NEXT: s_and_b32 s97, s26, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: s_add_i32 s24, s24, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s26, 26 -; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 +; SI-NEXT: s_and_b32 s86, s25, 0xffff0000 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_add_i32 s23, s23, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s25, 27 -; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 +; SI-NEXT: s_and_b32 s92, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_add_i32 s80, s80, 3 ; SI-NEXT: s_add_i32 s22, s22, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s24, 28 -; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 +; SI-NEXT: s_and_b32 s94, s23, 0xffff0000 ; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_and_b32 s4, s80, 0xffff -; SI-NEXT: s_add_i32 s84, s84, 3 -; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s23, 29 -; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 +; SI-NEXT: s_and_b32 s95, s22, 0xffff0000 ; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s22, 30 +; SI-NEXT: s_and_b32 s93, s16, 0xffff0000 
+; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s84, 0xffff -; SI-NEXT: s_add_i32 s83, s83, 3 -; SI-NEXT: s_add_i32 s18, s18, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s22, 30 -; SI-NEXT: s_and_b32 s95, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_or_b32 s5, s60, s5 -; SI-NEXT: s_and_b32 s60, s83, 0xffff ; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_add_i32 s59, s59, 3 ; SI-NEXT: s_add_i32 s57, s57, 3 @@ -231464,12 +232203,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s19, s19, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s21, 31 -; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: v_writelane_b32 v41, s16, 31 +; SI-NEXT: s_lshl_b32 s16, s21, 16 +; SI-NEXT: s_or_b32 s5, s60, s5 ; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_lshl_b32 s61, s63, 16 ; SI-NEXT: s_and_b32 s59, s59, 0xffff ; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_and_b32 s57, s57, 0xffff @@ -231482,10 +232220,10 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_and_b32 s41, s41, 0xffff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_add_i32 s16, s16, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 32 -; SI-NEXT: s_lshl_b32 s18, s19, 16 -; SI-NEXT: s_or_b32 s75, s61, s60 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s16, 32 +; SI-NEXT: s_lshl_b32 s16, s19, 16 +; SI-NEXT: s_or_b32 s76, s61, s60 ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s46, s46, s47 @@ -231493,13 +232231,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: s_or_b32 s40, s40, s41 ; SI-NEXT: s_add_i32 s6, s6, 
0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 33 -; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v41, s16, 33 +; SI-NEXT: s_lshl_b32 s16, s17, 16 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s76, s76, 0x30000 -; SI-NEXT: s_add_i32 s75, s75, 0x30000 ; SI-NEXT: s_add_i32 s58, s58, 0x30000 ; SI-NEXT: s_add_i32 s56, s56, 0x30000 ; SI-NEXT: s_add_i32 s46, s46, 0x30000 @@ -231515,82 +232251,82 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s15, s15, 0x30000 ; SI-NEXT: s_add_i32 s20, s20, 0x30000 -; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 ; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s35, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 +; SI-NEXT: s_and_b32 s30, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s31, s19, 0xffff0000 +; SI-NEXT: s_and_b32 s34, s17, 0xffff0000 ; SI-NEXT: v_writelane_b32 v41, s6, 35 -; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 +; SI-NEXT: s_and_b32 s36, s7, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s7, 16 -; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s37, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s37, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s83, s18, 16 +; SI-NEXT: s_and_b32 s38, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s22, s20, 16 -; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 +; SI-NEXT: s_and_b32 s39, s15, 0xffff0000 ; SI-NEXT: s_lshl_b32 s24, s15, 16 -; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s13, 16 -; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s11, 16 -; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s89, s9, 16 -; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 -; SI-NEXT: 
s_lshl_b32 s60, s8, 16 -; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 +; SI-NEXT: s_and_b32 s48, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s13, 16 +; SI-NEXT: s_and_b32 s49, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s11, 16 +; SI-NEXT: s_and_b32 s50, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_and_b32 s91, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s8, 16 +; SI-NEXT: s_and_b32 s51, s10, 0xffff0000 ; SI-NEXT: s_lshl_b32 s90, s10, 16 -; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s12, 16 -; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s14, 16 -; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s40, 16 -; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s42, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s44, 16 -; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s46, 16 -; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s82, s56, 16 -; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s58, 16 -; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s75, 16 +; SI-NEXT: s_and_b32 s52, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s12, 16 +; SI-NEXT: s_and_b32 s53, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s14, 16 +; SI-NEXT: s_and_b32 s54, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s40, 16 +; SI-NEXT: s_and_b32 s55, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s42, 16 +; SI-NEXT: s_and_b32 s64, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s44, 16 +; SI-NEXT: s_and_b32 s65, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s46, 16 +; SI-NEXT: s_and_b32 s66, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s56, 16 +; SI-NEXT: s_and_b32 s67, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s58, 16 ; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 ; SI-NEXT: s_lshl_b32 s76, s76, 16 ; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s5, 16 -; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 +; 
SI-NEXT: s_lshl_b32 s82, s5, 16 +; SI-NEXT: s_and_b32 s60, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s5, s4, 16 ; SI-NEXT: v_writelane_b32 v41, s6, 36 ; SI-NEXT: .LBB107_5: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_readlane_b32 s4, v41, 25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_readlane_b32 s4, v41, 24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 -; SI-NEXT: v_readlane_b32 s4, v41, 26 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_readlane_b32 s4, v41, 25 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_readlane_b32 s4, v41, 27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_readlane_b32 s4, v41, 26 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_readlane_b32 s4, v41, 28 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_readlane_b32 s4, v41, 27 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231598,7 +232334,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_readlane_b32 s4, v41, 29 +; SI-NEXT: v_readlane_b32 s4, v41, 28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231606,7 +232342,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_readlane_b32 s4, v41, 30 +; SI-NEXT: v_readlane_b32 s4, v41, 29 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231614,7 +232350,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_readlane_b32 s4, v41, 31 +; SI-NEXT: v_readlane_b32 s4, v41, 30 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231622,7 +232358,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_readlane_b32 s4, v41, 32 +; SI-NEXT: v_readlane_b32 s4, v41, 31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231630,7 +232366,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_readlane_b32 s4, v41, 33 +; SI-NEXT: v_readlane_b32 s4, v41, 32 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231638,7 +232374,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_readlane_b32 
s4, v41, 34 +; SI-NEXT: v_readlane_b32 s4, v41, 33 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231646,7 +232382,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_readlane_b32 s4, v41, 35 +; SI-NEXT: v_readlane_b32 s4, v41, 34 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231654,7 +232390,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_readlane_b32 s4, v41, 36 +; SI-NEXT: v_readlane_b32 s4, v41, 35 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -231662,92 +232398,93 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_readlane_b32 s4, v41, 36 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s83 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e64 v2, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 +; SI-NEXT: v_mul_f32_e64 
v2, 1.0, s90 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -231761,7 +232498,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s80 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -231775,7 +232512,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -231789,12 +232526,12 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -232114,8 +232851,8 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -233889,8 +234626,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 @@ -234033,8 +234770,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -236643,8 +237380,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 21ec3ee1996a6..e5a84239c1442 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -3820,17 +3820,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v20, v1 ; SI-NEXT: v_mov_b32_e32 v18, v2 ; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3844,32 +3846,31 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; SI-NEXT: .LBB26_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v11, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v11, v5, v11 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 
v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -3878,42 +3879,43 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: .LBB26_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 
s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3941,17 +3943,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v16i8_to_v4i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: v_mov_b32_e32 v20, v1 ; VI-NEXT: v_mov_b32_e32 v18, v2 ; VI-NEXT: v_mov_b32_e32 v17, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3965,16 +3969,12 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; VI-NEXT: .LBB26_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr4 @@ -3987,10 +3987,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: .LBB26_4: ; %cmp.true @@ -4011,8 +4015,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 ; VI-NEXT: v_add_u16_e32 v4, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 ; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -4029,17 +4033,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i8_to_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v1 ; GFX9-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] @@ -4053,16 +4059,12 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: .LBB26_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v14, v5 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr4 @@ -4075,10 +4077,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: .LBB26_4: ; %cmp.true @@ -4099,8 +4105,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 ; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -8286,17 +8292,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> 
%a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v20, v1 ; SI-NEXT: v_mov_b32_e32 v18, v2 ; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8310,32 +8318,31 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; SI-NEXT: .LBB50_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: 
v_and_b32_e32 v11, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v11, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v11, v5, v11 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -8344,42 +8351,43 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: .LBB50_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -8407,17 +8415,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v16i8_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: v_mov_b32_e32 v20, v1 ; VI-NEXT: v_mov_b32_e32 v18, v2 ; VI-NEXT: v_mov_b32_e32 v17, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] @@ -8431,16 +8441,12 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; VI-NEXT: .LBB50_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr4 @@ -8453,10 +8459,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: .LBB50_4: ; %cmp.true @@ -8477,8 +8487,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 ; VI-NEXT: v_add_u16_e32 v4, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 ; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -8495,17 +8505,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i8_to_v4f32: ; GFX9: ; %bb.0: 
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v1 ; GFX9-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8519,16 +8531,12 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: .LBB50_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr4 @@ -8541,10 +8549,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: 
s_cbranch_execz .LBB50_2 ; GFX9-NEXT: .LBB50_4: ; %cmp.true @@ -8565,8 +8577,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 ; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -12368,17 +12380,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v20, v1 ; SI-NEXT: v_mov_b32_e32 v18, v2 ; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12392,32 +12406,31 @@ define <2 x i64> 
@bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; SI-NEXT: .LBB70_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v11, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v11, v5, v11 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: 
$vgpr4 @@ -12426,42 +12439,43 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: .LBB70_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -12489,17 +12503,19 @@ define <2 x i64> 
@bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v16i8_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: v_mov_b32_e32 v20, v1 ; VI-NEXT: v_mov_b32_e32 v18, v2 ; VI-NEXT: v_mov_b32_e32 v17, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12513,16 +12529,12 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr4 @@ -12535,10 +12547,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB70_2 ; 
VI-NEXT: .LBB70_4: ; %cmp.true @@ -12559,8 +12575,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 ; VI-NEXT: v_add_u16_e32 v4, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 ; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -12577,17 +12593,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i8_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v1 ; GFX9-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12601,16 +12619,12 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; 
GFX9-NEXT: .LBB70_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: 
$vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr4 @@ -12623,10 +12637,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB70_2 ; GFX9-NEXT: .LBB70_4: ; %cmp.true @@ -12647,8 +12665,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 ; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -16060,17 +16078,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v20, v1 ; SI-NEXT: v_mov_b32_e32 v18, v2 ; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -16084,32 +16104,31 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; SI-NEXT: .LBB86_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v10 +; SI-NEXT: 
v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v11, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v11, v5, v11 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -16118,42 +16137,43 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_2 ; SI-NEXT: .LBB86_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -16181,17 +16201,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v16i8_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: v_mov_b32_e32 v20, v1 ; VI-NEXT: v_mov_b32_e32 v18, v2 ; VI-NEXT: v_mov_b32_e32 v17, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -16205,16 +16227,12 @@ define <2 x double> 
@bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; VI-NEXT: .LBB86_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; 
implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr4 @@ -16227,10 +16245,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB86_2 ; VI-NEXT: .LBB86_4: ; %cmp.true @@ -16251,8 +16273,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 ; VI-NEXT: v_add_u16_e32 v4, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 ; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -16269,17 +16291,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i8_to_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v1 ; GFX9-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -16293,16 +16317,12 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: .LBB86_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: 
v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr4 @@ -16315,10 +16335,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB86_2 ; GFX9-NEXT: .LBB86_4: ; 
%cmp.true @@ -16339,8 +16363,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 ; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -18737,17 +18761,20 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v8 ; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v22, v5 ; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v23, v1 ; SI-NEXT: v_mov_b32_e32 v16, v6 ; SI-NEXT: v_mov_b32_e32 v17, v4 ; SI-NEXT: v_mov_b32_e32 v18, v2 ; SI-NEXT: v_mov_b32_e32 v19, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -18777,11 +18804,11 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 
v0, 0xffff, v19 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v8, v5, v24 +; SI-NEXT: v_or_b32_e32 v8, v5, v22 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v4, v1, v23 -; SI-NEXT: v_or_b32_e32 v12, v5, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v4, v1, v25 +; SI-NEXT: v_or_b32_e32 v12, v5, v24 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -18795,31 +18822,31 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v7, v21, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v20, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: .LBB96_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 
s6, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 @@ -18842,11 +18869,11 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v19, v3 ; VI-NEXT: v_mov_b32_e32 v18, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr3 @@ -18855,10 +18882,10 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_2 @@ -19468,76 +19495,75 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v8i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v6 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v5 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v15 
+; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v16, v2 -; SI-NEXT: v_or_b32_e32 v2, v1, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 -; SI-NEXT: v_or_b32_e32 v4, v21, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v7, v15, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v5, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v13, v0, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: 
v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_or_b32_e32 v10, v23, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v10, 16 -; SI-NEXT: v_or_b32_e32 v4, v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v21, v18, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v25, v13, v18 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v19, v1, v5 +; SI-NEXT: v_alignbit_b32 v1, v18, v5, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v13 +; SI-NEXT: v_or_b32_e32 v15, v5, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v17, v5, v24 +; SI-NEXT: v_alignbit_b32 v5, v15, v24, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 
; SI-NEXT: .LBB98_2: ; %Flow @@ -19546,73 +19572,78 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v22, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_or_b32_e32 v0, v11, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; 
SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v13, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v1, v18, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v15, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v2, v18 +; SI-NEXT: v_mov_b32_e32 v4, v17 +; SI-NEXT: v_mov_b32_e32 v6, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: v_mov_b32_e32 v20, v1 ; VI-NEXT: v_mov_b32_e32 v18, v2 ; VI-NEXT: v_mov_b32_e32 v17, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: 
v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19626,16 +19657,12 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; VI-NEXT: .LBB98_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr4 @@ -19648,10 +19675,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_2 ; VI-NEXT: .LBB98_4: ; %cmp.true @@ -19672,8 +19703,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 ; VI-NEXT: v_add_u16_e32 v4, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 
v2, 0x300, v2 ; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -19690,17 +19721,19 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i8_to_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v1 ; GFX9-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19714,16 +19747,12 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: .LBB98_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr4 @@ -19736,10 +19765,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: 
v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_2 ; GFX9-NEXT: .LBB98_4: ; %cmp.true @@ -19760,8 +19793,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 ; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -21663,16 +21696,24 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v14, v8 +; SI-NEXT: v_mov_b32_e32 v15, v7 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: 
v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -21698,13 +21739,13 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB104_3: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v8, v20, v5 +; SI-NEXT: v_or_b32_e32 v8, v17, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v4, v16, v1 -; SI-NEXT: v_or_b32_e32 v12, v19, v5 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_or_b32_e32 v4, v19, v1 +; SI-NEXT: v_or_b32_e32 v12, v16, v5 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -21716,18 +21757,18 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: .LBB104_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 
v1, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -21738,12 +21779,12 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v8, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; SI-NEXT: v_or_b32_e32 v12, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22391,22 +22432,22 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr18 ; 
SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -22414,32 +22455,24 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v4, v4, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v4, v4, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v21 -; SI-NEXT: v_or_b32_e32 v4, v4, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v13, v13, v9 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v23, v18, v23 +; SI-NEXT: v_or_b32_e32 
v15, v19, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 @@ -22447,10 +22480,18 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -22458,67 +22499,71 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: 
v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 
; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: v_mov_b32_e32 v0, v17 +; SI-NEXT: v_mov_b32_e32 v2, v18 +; SI-NEXT: v_mov_b32_e32 v4, v19 ; SI-NEXT: v_mov_b32_e32 v6, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: v_mov_b32_e32 v20, v1 ; VI-NEXT: v_mov_b32_e32 v18, v2 ; VI-NEXT: v_mov_b32_e32 v17, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22532,16 +22577,12 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; VI-NEXT: .LBB106_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr4 @@ -22554,10 +22595,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: v_or_b32_sdwa v2, 
v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB106_2 ; VI-NEXT: .LBB106_4: ; %cmp.true @@ -22578,8 +22623,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 ; VI-NEXT: v_add_u16_e32 v4, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 ; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -22596,17 +22641,19 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i8_to_v8f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v1 ; GFX9-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; GFX9-NEXT: 
v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22620,16 +22667,12 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: .LBB106_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v10, v11 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr4 @@ -22642,10 +22685,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB106_2 ; GFX9-NEXT: .LBB106_4: ; %cmp.true @@ -22666,8 +22713,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 ; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, 
v2 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -23389,15 +23436,24 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v8bf16_to_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -23424,55 +23480,55 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB108_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, 
v16 ; SI-NEXT: v_alignbit_b32 v0, v0, v20, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v18, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v19, 16 ; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16 -; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v17, 16 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 ; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 ; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 ; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v16 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v18 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: .LBB108_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 ; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 @@ -24586,55 +24642,50 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v1 -; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_mov_b32_e32 v17, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v20, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: v_or_b32_e32 v11, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v5, v21, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_or_b32_e32 v7, v13, v2 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v18, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_or_b32_e32 v19, v16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v21, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v15, v15, v23 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_or_b32_e32 v11, v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; 
SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 @@ -24642,94 +24693,102 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v12 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v15 -; SI-NEXT: v_mov_b32_e32 v4, v11 -; SI-NEXT: v_mov_b32_e32 v6, v16 +; SI-NEXT: v_mov_b32_e32 v0, v18 +; SI-NEXT: v_mov_b32_e32 v1, v19 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v6, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8bf16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: v_mov_b32_e32 v20, v1 ; VI-NEXT: v_mov_b32_e32 v18, v2 ; VI-NEXT: v_mov_b32_e32 v17, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 
8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -24743,16 +24802,12 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; VI-NEXT: .LBB110_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr4 @@ -24765,10 +24820,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB110_2 ; VI-NEXT: .LBB110_4: ; %cmp.true @@ -24789,8 +24848,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 ; VI-NEXT: v_add_u16_e32 v4, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 ; VI-NEXT: v_add_u16_sdwa v4, 
v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -24807,17 +24866,19 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i8_to_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v1 ; GFX9-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -24831,16 +24892,12 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: .LBB110_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa 
v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr4 @@ -24853,10 +24910,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: v_or_b32_sdwa v2, v13, v15 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: v_or_b32_sdwa v3, v16, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB110_2 ; GFX9-NEXT: .LBB110_4: ; %cmp.true @@ -24877,8 +24938,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 ; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index c3ace0ac5af71..8080fc33db0c7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -7771,18 +7771,15 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v12i16_to_v12f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v11 -; SI-NEXT: v_mov_b32_e32 v22, v10 -; SI-NEXT: v_mov_b32_e32 v21, v9 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v19, v7 -; SI-NEXT: v_mov_b32_e32 v18, v6 -; SI-NEXT: v_mov_b32_e32 v17, v5 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v13, v1 -; SI-NEXT: 
v_mov_b32_e32 v24, v0 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -7793,34 +7790,22 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB56_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB56_4 -; SI-NEXT: .LBB56_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB56_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -7830,21 +7815,28 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_2 -; SI-NEXT: .LBB56_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 ; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -7854,10 +7846,14 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v9, v12 +; SI-NEXT: v_mov_b32_e32 v10, v13 +; SI-NEXT: v_mov_b32_e32 v11, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v12f16: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index c830d6b344b6f..4dc0a9b027a6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -739,14 +739,17 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v14i16_to_v7i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v1 ; SI-NEXT: v_mov_b32_e32 v18, v6 ; SI-NEXT: v_mov_b32_e32 v17, v4 ; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v15, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 @@ -770,25 +773,25 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; SI-NEXT: 
v_and_b32_e32 v5, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v21 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v19 -; SI-NEXT: v_or_b32_e32 v4, v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 @@ -1128,18 +1131,17 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v7i32_to_v14f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; 
implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 @@ -1152,74 +1154,72 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; 
SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v22 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14f16: @@ -1475,21 +1475,28 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v7i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1501,41 +1508,41 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 
+; SI-NEXT: v_or_b32_e32 v2, v18, v2 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1544,25 +1551,25 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: 
v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2206,14 +2213,17 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v14i16_to_v7f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v1 ; SI-NEXT: v_mov_b32_e32 v18, v6 ; SI-NEXT: v_mov_b32_e32 v17, v4 ; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v15, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 @@ -2237,25 +2247,25 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v21 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v19 -; SI-NEXT: v_or_b32_e32 v4, v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 
v3, v3, v19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 @@ -2595,18 +2605,17 @@ define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v7f32_to_v14f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 @@ -2619,74 +2628,72 @@ define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; 
SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 +; 
SI-NEXT: v_cvt_f32_f16_e32 v20, v22 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; 
SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14f16: @@ -2945,21 +2952,28 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v7f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] @@ -2971,41 +2985,41 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; 
SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3014,25 +3028,25 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3389,20 +3403,17 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v14i16_to_v14f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v26, v12 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 -; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: 
v_mov_b32_e32 v22, v8 -; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v27, v10 +; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -3415,36 +3426,24 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 -; 
SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -3456,23 +3455,30 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: 
s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -3484,10 +3490,14 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v11, v14 +; SI-NEXT: v_mov_b32_e32 v12, v15 +; SI-NEXT: v_mov_b32_e32 v13, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v14f16: @@ -4069,31 +4079,31 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; VI-NEXT: s_lshr_b32 s4, s20, 16 ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v13, s4 ; VI-NEXT: s_lshr_b32 s4, s22, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v14, s4 -; VI-NEXT: v_add_f16_e32 v7, s16, v0 -; VI-NEXT: v_add_f16_sdwa v8, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s17, v0 -; 
VI-NEXT: v_add_f16_sdwa v9, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 -; VI-NEXT: v_add_f16_sdwa v10, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s19, v0 -; VI-NEXT: v_add_f16_sdwa v11, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, s20, v0 -; VI-NEXT: v_add_f16_sdwa v12, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, s21, v0 -; VI-NEXT: v_add_f16_sdwa v13, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v6, s22, v0 -; VI-NEXT: v_add_f16_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v0 -; VI-NEXT: v_or_b32_e32 v5, v5, v13 -; VI-NEXT: v_or_b32_e32 v4, v4, v12 -; VI-NEXT: v_or_b32_e32 v3, v3, v11 -; VI-NEXT: v_or_b32_e32 v2, v2, v10 -; VI-NEXT: v_or_b32_e32 v1, v1, v9 -; VI-NEXT: v_or_b32_e32 v0, v7, v8 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s16, v0 +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: v_add_f16_e32 v9, s18, v0 +; VI-NEXT: v_add_f16_e32 v10, s19, v0 +; VI-NEXT: v_add_f16_e32 v11, s20, v0 +; VI-NEXT: v_add_f16_e32 v12, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v6, v14 +; VI-NEXT: v_add_f16_sdwa v14, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v3, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v4, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_add_f16_sdwa v0, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v12, v0 +; VI-NEXT: v_or_b32_e32 v4, v11, v4 +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_or_b32_e32 v2, v9, v2 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_or_b32_e32 v0, v7, v14 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 38302a75fe26d..3c01977465d68 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -1712,26 +1712,26 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -1755,8 +1755,8 @@ define <8 x i32> 
@bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v23, v2 ; SI-NEXT: v_or_b32_e32 v3, v22, v3 ; SI-NEXT: v_or_b32_e32 v4, v21, v4 @@ -2107,23 +2107,20 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v8i32_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v7 -; SI-NEXT: v_mov_b32_e32 v17, v6 -; SI-NEXT: v_mov_b32_e32 v18, v5 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v20, v3 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v22, v1 -; SI-NEXT: v_mov_b32_e32 v23, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -2134,82 +2131,82 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: 
.LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: 
v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 
3, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v20 +; SI-NEXT: v_mov_b32_e32 v7, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16f16: @@ -2482,23 +2479,23 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: 
v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2510,92 +2507,92 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v21, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: ; implicit-def: $vgpr28 +; 
SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v2, v22, v2 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -2967,15 +2964,16 @@ define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v8i32_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v5 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -3002,41 +3000,41 @@ define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: ; 
implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_and_b32_e32 v13, 
0xffff0000, v6 @@ -3329,8 +3327,8 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 @@ -3355,7 +3353,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB22_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 @@ -3363,26 +3361,26 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_alignbit_b32 v1, v1, v24, 16 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 ; SI-NEXT: ; 
implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -3390,24 +3388,24 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: .LBB22_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v22 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -4809,16 +4807,19 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x 
i32> %a, i32 %b) { ; VI-LABEL: bitcast_v8i32_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 ; VI-NEXT: v_mov_b32_e32 v35, v3 ; VI-NEXT: v_mov_b32_e32 v34, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr13 @@ -4832,39 +4833,38 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; 
VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: .LBB24_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -4874,61 +4874,62 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v35 ; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v36 ; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: 
v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: .LBB24_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 ; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v16, v36 +; VI-NEXT: v_mov_b32_e32 v20, v37 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v1, v28 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
bitcast_v8i32_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 ; GFX9-NEXT: v_mov_b32_e32 v35, v3 ; GFX9-NEXT: v_mov_b32_e32 v34, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr13 @@ -4942,39 +4943,38 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 
v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -4984,46 +4984,44 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_add_u32_e32 v35, 3, v35 ; GFX9-NEXT: v_add_u32_e32 v34, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v37, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v36, 3, v36 ; GFX9-NEXT: v_add_u32_e32 v33, 3, v33 ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; 
GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 ; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 
v16, v36 +; GFX9-NEXT: v_mov_b32_e32 v20, v37 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v8i32_to_v32i8: @@ -5752,8 +5750,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v34, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 @@ -5772,9 +5770,9 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5797,51 +5795,49 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; 
SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 ; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; 
SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v6, v23, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 ; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -5867,10 +5863,12 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -5975,8 +5973,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; VI-NEXT: v_mov_b32_e32 v34, v6 ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 @@ -5995,9 +5993,9 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v0 ; VI-NEXT: s_waitcnt 
vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6014,25 +6012,18 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; 
implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -6050,18 +6041,25 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: v_or_b32_sdwa v4, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: v_or_b32_sdwa v5, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: v_or_b32_sdwa v6, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_or_b32_sdwa v7, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 @@ -6073,14 +6071,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v7, 0x300 -; VI-NEXT: v_add_u16_e32 v2, 3, v33 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 ; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 @@ -6133,8 +6131,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v1 @@ -6153,9 +6151,9 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6172,25 +6170,18 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, 
v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -6208,18 +6199,25 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: v_or_b32_sdwa v4, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: v_or_b32_sdwa v7, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 @@ -6769,6 +6767,13 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v18 @@ -6782,21 +6787,14 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 @@ -6957,6 +6955,9 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -6965,12 +6966,9 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -7113,6 +7111,9 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -7121,12 +7122,9 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 @@ -8702,26 +8700,26 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 
@@ -8745,8 +8743,8 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v23, v2 ; SI-NEXT: v_or_b32_e32 v3, v22, v3 ; SI-NEXT: v_or_b32_e32 v4, v21, v4 @@ -9097,23 +9095,20 @@ define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v8f32_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v7 -; SI-NEXT: v_mov_b32_e32 v17, v6 -; SI-NEXT: v_mov_b32_e32 v18, v5 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v20, v3 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v22, v1 -; SI-NEXT: v_mov_b32_e32 v23, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -9124,82 +9119,82 @@ define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
SI-NEXT: s_cbranch_execnz .LBB40_4 -; SI-NEXT: .LBB40_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB40_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr3 ; 
SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 -; SI-NEXT: .LBB40_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, 
v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 +; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v20 +; SI-NEXT: v_mov_b32_e32 v7, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16f16: @@ -9473,23 +9468,23 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v8f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: 
v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -9501,92 +9496,92 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v21, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: ; implicit-def: $vgpr28 +; 
SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v2, v22, v2 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -9958,15 +9953,16 @@ define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v8f32_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v5 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -9993,41 +9989,41 @@ define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: ; 
implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 @@ -10322,8 +10318,8 @@ define <8 x 
float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 @@ -10348,7 +10344,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 @@ -10356,26 +10352,26 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_alignbit_b32 v1, v1, v24, 16 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; 
implicit-def: $vgpr16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -10383,24 +10379,24 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v22 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -11802,16 +11798,19 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; 
VI-LABEL: bitcast_v8f32_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 ; VI-NEXT: v_mov_b32_e32 v35, v3 ; VI-NEXT: v_mov_b32_e32 v34, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr13 @@ -11825,39 +11824,38 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -11867,61 +11865,62 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_add_f32_e32 v35, 1.0, v35 ; VI-NEXT: v_add_f32_e32 v34, 1.0, v34 +; VI-NEXT: v_add_f32_e32 v37, 1.0, v37 +; VI-NEXT: v_add_f32_e32 v36, 1.0, v36 ; VI-NEXT: v_add_f32_e32 v33, 1.0, v33 ; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 ; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v16, v36 +; VI-NEXT: v_mov_b32_e32 v20, v37 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v1, v28 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v8f32_to_v32i8: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 ; GFX9-NEXT: v_mov_b32_e32 v35, v3 ; GFX9-NEXT: v_mov_b32_e32 v34, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr13 @@ -11935,39 +11934,38 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: 
v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -11977,46 +11975,44 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_add_f32_e32 v35, 1.0, v35 ; GFX9-NEXT: v_add_f32_e32 v34, 1.0, v34 +; GFX9-NEXT: v_add_f32_e32 v37, 1.0, v37 +; GFX9-NEXT: v_add_f32_e32 v36, 1.0, v36 ; GFX9-NEXT: v_add_f32_e32 v33, 1.0, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: 
v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 ; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v16, v36 +; 
GFX9-NEXT: v_mov_b32_e32 v20, v37 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v8f32_to_v32i8: @@ -12407,12 +12403,12 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v16 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 @@ -12447,24 +12443,20 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; VI-NEXT: s_branch .LBB49_2 ; VI-NEXT: .LBB49_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 ; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 ; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 ; VI-NEXT: v_mov_b32_e32 v35, s59 ; VI-NEXT: v_mov_b32_e32 v2, s57 ; VI-NEXT: v_mov_b32_e32 v5, s58 ; VI-NEXT: v_mov_b32_e32 v6, s56 ; VI-NEXT: v_mov_b32_e32 v7, s47 -; VI-NEXT: v_mov_b32_e32 v34, s46 +; VI-NEXT: v_mov_b32_e32 v33, s46 ; VI-NEXT: v_mov_b32_e32 v10, s44 ; VI-NEXT: v_mov_b32_e32 v13, s45 ; VI-NEXT: v_mov_b32_e32 v14, s43 ; VI-NEXT: v_mov_b32_e32 v15, s42 -; VI-NEXT: v_mov_b32_e32 v33, s41 +; VI-NEXT: v_mov_b32_e32 v34, s41 ; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_mov_b32_e32 v21, s40 ; VI-NEXT: v_mov_b32_e32 v22, s28 @@ -12473,8 +12465,12 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; VI-NEXT: 
v_mov_b32_e32 v26, s24 ; VI-NEXT: v_mov_b32_e32 v29, s25 ; VI-NEXT: v_mov_b32_e32 v30, s15 -; VI-NEXT: v_mov_b32_e32 v31, s14 ; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v31, s14 ; VI-NEXT: v_mov_b32_e32 v19, s8 ; VI-NEXT: v_mov_b32_e32 v11, s6 ; VI-NEXT: v_mov_b32_e32 v3, s4 @@ -12484,8 +12480,8 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v20, v17 ; VI-NEXT: v_mov_b32_e32 v28, v25 ; VI-NEXT: v_mov_b32_e32 v1, v35 -; VI-NEXT: v_mov_b32_e32 v9, v34 -; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v17, v34 ; VI-NEXT: v_mov_b32_e32 v25, v32 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -12542,12 +12538,12 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 @@ -12582,24 +12578,20 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; GFX9-NEXT: s_branch .LBB49_2 ; GFX9-NEXT: .LBB49_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: v_mov_b32_e32 v9, s19 ; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: v_mov_b32_e32 v24, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 ; 
GFX9-NEXT: v_mov_b32_e32 v35, s59 ; GFX9-NEXT: v_mov_b32_e32 v2, s57 ; GFX9-NEXT: v_mov_b32_e32 v5, s58 ; GFX9-NEXT: v_mov_b32_e32 v6, s56 ; GFX9-NEXT: v_mov_b32_e32 v7, s47 -; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v33, s46 ; GFX9-NEXT: v_mov_b32_e32 v10, s44 ; GFX9-NEXT: v_mov_b32_e32 v13, s45 ; GFX9-NEXT: v_mov_b32_e32 v14, s43 ; GFX9-NEXT: v_mov_b32_e32 v15, s42 -; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v34, s41 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v21, s40 ; GFX9-NEXT: v_mov_b32_e32 v22, s28 @@ -12608,8 +12600,12 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v26, s24 ; GFX9-NEXT: v_mov_b32_e32 v29, s25 ; GFX9-NEXT: v_mov_b32_e32 v30, s15 -; GFX9-NEXT: v_mov_b32_e32 v31, s14 ; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 ; GFX9-NEXT: v_mov_b32_e32 v19, s8 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 @@ -12619,8 +12615,8 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v20, v17 ; GFX9-NEXT: v_mov_b32_e32 v28, v25 ; GFX9-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v17, v34 ; GFX9-NEXT: v_mov_b32_e32 v25, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -12771,8 +12767,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:4 ; SI-NEXT: v_mov_b32_e32 v34, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 @@ -12791,9 +12787,9 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12816,51 +12812,49 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 ; SI-NEXT: v_or_b32_e32 v4, 
v37, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v6, v23, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 ; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 @@ 
-12886,10 +12880,12 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -12994,8 +12990,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; VI-NEXT: v_mov_b32_e32 v34, v6 ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 @@ -13014,9 +13010,9 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13033,25 +13029,18 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -13069,18 +13058,25 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_or_b32_sdwa v3, v5, 
v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: v_or_b32_sdwa v4, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: v_or_b32_sdwa v5, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: v_or_b32_sdwa v6, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_or_b32_sdwa v7, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 @@ -13092,14 +13088,14 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v7, 0x300 -; VI-NEXT: v_add_u16_e32 v2, 3, v33 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 ; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; 
VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 @@ -13152,8 +13148,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v1 @@ -13172,9 +13168,9 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13191,25 +13187,18 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v6, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -13227,18 +13216,25 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: v_or_b32_sdwa v4, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; 
implicit-def: $vgpr36 +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: v_or_b32_sdwa v7, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 @@ -13788,6 +13784,13 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v18 @@ -13801,21 +13804,14 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 ; SI-NEXT: 
v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 @@ -13976,6 +13972,9 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -13984,12 +13983,9 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -14132,6 +14128,9 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -14140,12 +14139,9 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 @@ -15258,26 +15254,26 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -15301,8 +15297,8 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v23, v2 ; SI-NEXT: v_or_b32_e32 v3, v22, v3 ; SI-NEXT: v_or_b32_e32 v4, v21, v4 @@ -15653,23 +15649,20 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v4i64_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: v_mov_b32_e32 v13, v8 ; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_mov_b32_e32 v22, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -15680,70 +15673,65 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_4 -; SI-NEXT: .LBB60_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB60_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: .LBB60_2: ; 
%Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 -; SI-NEXT: .LBB60_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v23, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v21, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v19, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -15751,11 +15739,16 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 +; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v20 +; SI-NEXT: v_mov_b32_e32 v7, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16f16: @@ -16030,23 +16023,23 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v4i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: 
v_cvt_f16_f32_e32 v16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -16058,92 +16051,92 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v21, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v2, v22, v2 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; 
SI-NEXT: v_or_b32_e32 v5, v16, v5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; 
SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -16515,15 +16508,16 @@ define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v4i64_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v5 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v18, v2 
-; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -16550,41 +16544,41 @@ define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v19, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v21, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v16, vcc ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 @@ -16879,8 +16873,8 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 ; SI-NEXT: 
v_mul_f32_e32 v20, 1.0, v7 @@ -16905,7 +16899,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB66_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 @@ -16913,26 +16907,26 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_alignbit_b32 v1, v1, v24, 16 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -16940,24 +16934,24 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: 
.LBB66_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v22 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -18359,16 +18353,19 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; VI-LABEL: bitcast_v4i64_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 ; VI-NEXT: v_mov_b32_e32 v35, v3 ; VI-NEXT: v_mov_b32_e32 v34, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; 
implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr13 @@ -18382,39 +18379,38 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: .LBB68_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -18424,61 +18420,62 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 ; VI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v36 +; VI-NEXT: v_addc_u32_e32 v37, vcc, 0, v37, vcc ; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 ; VI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 ; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v16, v36 +; VI-NEXT: v_mov_b32_e32 v20, v37 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v1, v28 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v4i64_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 ; GFX9-NEXT: v_mov_b32_e32 v35, v3 ; GFX9-NEXT: v_mov_b32_e32 v34, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr13 @@ -18492,39 +18489,38 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 
v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; GFX9-NEXT: .LBB68_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -18534,46 +18530,44 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v34, vcc, 3, v34 ; GFX9-NEXT: v_addc_co_u32_e32 v35, vcc, 0, v35, vcc +; GFX9-NEXT: v_add_co_u32_e32 v36, vcc, 3, v36 +; GFX9-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v37, vcc ; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 ; GFX9-NEXT: v_addc_co_u32_e32 v33, vcc, 0, v33, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, 
v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 ; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v16, v36 +; GFX9-NEXT: v_mov_b32_e32 v20, v37 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v4i64_to_v32i8: @@ -19308,8 +19302,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x 
i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v34, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 @@ -19328,9 +19322,9 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19353,51 +19347,49 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, 
v12 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 ; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v6, v23, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 ; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 
+; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -19423,10 +19415,12 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -19531,8 +19525,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; VI-NEXT: v_mov_b32_e32 v34, v6 ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 @@ -19551,9 +19545,9 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19570,25 +19564,18 @@ define <4 x i64> 
@bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -19606,18 +19593,25 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: v_or_b32_sdwa v4, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: v_or_b32_sdwa v5, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: v_or_b32_sdwa v6, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_or_b32_sdwa v7, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 @@ -19629,14 +19623,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v7, 0x300 -; VI-NEXT: v_add_u16_e32 v2, 3, v33 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 ; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: 
v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 @@ -19689,8 +19683,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v1 @@ -19709,9 +19703,9 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19728,25 +19722,18 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -19764,18 +19751,25 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 +; 
GFX9-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: v_or_b32_sdwa v4, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: v_or_b32_sdwa v7, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 @@ -20325,6 +20319,13 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v18 @@ -20338,21 +20339,14 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: 
v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 @@ -20513,6 +20507,9 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -20521,12 +20518,9 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, 
v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -20669,6 +20663,9 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -20677,12 +20674,9 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 @@ -21343,26 +21337,26 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -21386,8 +21380,8 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: 
v_or_b32_e32 v1, v24, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v23, v2 ; SI-NEXT: v_or_b32_e32 v3, v22, v3 ; SI-NEXT: v_or_b32_e32 v4, v21, v4 @@ -21738,15 +21732,16 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v4f64_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -21759,33 +21754,33 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: 
v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v24 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: .LBB76_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -21796,9 +21791,9 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 @@ -21808,27 +21803,27 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 ; SI-NEXT: v_cvt_f32_f16_e32 
v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v0, v16 ; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v2, v21 -; SI-NEXT: v_mov_b32_e32 v3, v20 -; SI-NEXT: v_mov_b32_e32 v4, v19 -; SI-NEXT: v_mov_b32_e32 v5, v17 +; SI-NEXT: v_mov_b32_e32 v2, v19 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v4, v17 +; SI-NEXT: v_mov_b32_e32 v5, v21 ; SI-NEXT: v_mov_b32_e32 v6, v18 -; SI-NEXT: v_mov_b32_e32 v7, v16 +; SI-NEXT: v_mov_b32_e32 v7, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16f16: @@ -22081,23 +22076,23 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 
v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22109,92 +22104,92 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v21, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: ; implicit-def: 
$vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v2, v22, v2 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 
0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -22566,15 +22561,16 @@ define <16 
x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v4f64_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -22590,22 +22586,22 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; 
SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: .LBB80_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -22623,24 +22619,24 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v23 -; SI-NEXT: v_mov_b32_e32 v1, v22 -; SI-NEXT: v_mov_b32_e32 v2, v21 -; SI-NEXT: v_mov_b32_e32 v3, v20 -; SI-NEXT: v_mov_b32_e32 v4, v19 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v6, v17 -; SI-NEXT: v_mov_b32_e32 v7, v16 +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v2, v23 +; SI-NEXT: v_mov_b32_e32 v3, v19 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_mov_b32_e32 v5, v17 +; SI-NEXT: v_mov_b32_e32 v6, v16 +; SI-NEXT: v_mov_b32_e32 v7, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16bf16: @@ -22897,8 +22893,8 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 
v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 @@ -22923,7 +22919,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB82_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 @@ -22931,26 +22927,26 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_alignbit_b32 v1, v1, v24, 16 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v7, 
v7, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -22958,24 +22954,24 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: .LBB82_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v22 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -24379,16 +24375,19 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; VI-LABEL: bitcast_v4f64_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 
+; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 ; VI-NEXT: v_mov_b32_e32 v35, v3 ; VI-NEXT: v_mov_b32_e32 v34, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr13 @@ -24402,99 +24401,99 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: 
v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB84_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 ; VI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, 
v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: .LBB84_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 ; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v16, v36 +; VI-NEXT: v_mov_b32_e32 v20, v37 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v1, v28 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v4f64_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; 
GFX9-NEXT: v_mov_b32_e32 v36, v4 ; GFX9-NEXT: v_mov_b32_e32 v35, v3 ; GFX9-NEXT: v_mov_b32_e32 v34, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr13 @@ -24508,84 +24507,81 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB84_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 ; GFX9-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, 
v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 ; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v16, v36 +; GFX9-NEXT: v_mov_b32_e32 v20, v37 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v4f64_to_v32i8: @@ -24828,13 +24824,13 @@ define inreg <32 x i8> 
@bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 ; SI-NEXT: v_alignbit_b32 v35, s17, v0, 8 ; SI-NEXT: s_lshr_b32 s25, s23, 24 -; SI-NEXT: s_lshr_b32 s24, s23, 16 -; SI-NEXT: s_lshr_b32 s15, s23, 8 -; SI-NEXT: s_lshr_b32 s14, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 8 -; SI-NEXT: s_lshr_b32 s11, s19, 24 -; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 8 +; SI-NEXT: s_lshr_b32 s24, s21, 24 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s13, s19, 24 +; SI-NEXT: s_lshr_b32 s11, s19, 16 ; SI-NEXT: s_lshr_b32 s9, s19, 8 ; SI-NEXT: s_lshr_b32 s8, s17, 24 ; SI-NEXT: s_lshr_b32 s7, s17, 16 @@ -24881,19 +24877,19 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: @@ -24901,22 +24897,22 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v9, s19 ; SI-NEXT: v_mov_b32_e32 v17, s21 ; SI-NEXT: v_mov_b32_e32 v25, s23 -; SI-NEXT: v_mov_b32_e32 v24, s22 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; 
SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v7, s8 ; SI-NEXT: v_mov_b32_e32 v6, s7 ; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: v_mov_b32_e32 v15, s11 -; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v14, s11 ; SI-NEXT: v_mov_b32_e32 v13, s9 -; SI-NEXT: v_mov_b32_e32 v23, s14 -; SI-NEXT: v_mov_b32_e32 v22, s13 -; SI-NEXT: v_mov_b32_e32 v21, s12 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s15 +; SI-NEXT: v_mov_b32_e32 v21, s14 ; SI-NEXT: v_mov_b32_e32 v31, s25 -; SI-NEXT: v_mov_b32_e32 v30, s24 -; SI-NEXT: v_mov_b32_e32 v29, s15 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v30, s12 +; SI-NEXT: v_mov_b32_e32 v29, s10 ; SI-NEXT: .LBB85_5: ; %end ; SI-NEXT: v_mov_b32_e32 v4, v1 ; SI-NEXT: v_mov_b32_e32 v12, v9 @@ -24956,8 +24952,8 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s16, 8 ; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB85_4 ; VI-NEXT: .LBB85_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 @@ -24992,13 +24988,13 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; VI-NEXT: .LBB85_3: ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr15 ; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; 
implicit-def: $sgpr25 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr27 @@ -25020,10 +25016,6 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v8, s18 ; VI-NEXT: v_mov_b32_e32 v16, s20 ; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v17, s21 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s59 ; VI-NEXT: v_mov_b32_e32 v35, s58 ; VI-NEXT: v_mov_b32_e32 v10, s57 @@ -25043,9 +25035,13 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v13, s25 ; VI-NEXT: v_mov_b32_e32 v7, s24 ; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v5, s14 -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v11, s4 ; VI-NEXT: v_mov_b32_e32 v19, s8 ; VI-NEXT: v_mov_b32_e32 v27, s10 ; VI-NEXT: .LBB85_5: ; %end @@ -25087,8 +25083,8 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s58, s16, 8 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB85_4 ; GFX9-NEXT: .LBB85_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 @@ -25123,13 +25119,13 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; GFX9-NEXT: .LBB85_3: ; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr59 -; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: ; implicit-def: $sgpr14 ; 
GFX9-NEXT: ; implicit-def: $sgpr15 ; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: ; implicit-def: $sgpr57 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr4 ; GFX9-NEXT: ; implicit-def: $sgpr25 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr27 @@ -25151,10 +25147,6 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v8, s18 ; GFX9-NEXT: v_mov_b32_e32 v16, s20 ; GFX9-NEXT: v_mov_b32_e32 v24, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s19 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s59 ; GFX9-NEXT: v_mov_b32_e32 v35, s58 ; GFX9-NEXT: v_mov_b32_e32 v10, s57 @@ -25174,9 +25166,13 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v13, s25 ; GFX9-NEXT: v_mov_b32_e32 v7, s24 ; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-NEXT: v_mov_b32_e32 v19, s8 ; GFX9-NEXT: v_mov_b32_e32 v27, s10 ; GFX9-NEXT: .LBB85_5: ; %end @@ -25335,8 +25331,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v34, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 @@ -25355,9 +25351,9 @@ 
define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25380,51 +25376,49 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 ; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 -; SI-NEXT: 
v_and_b32_e32 v5, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v6, v23, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 ; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -25450,10 +25444,12 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; 
implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -25558,8 +25554,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; VI-NEXT: v_mov_b32_e32 v34, v6 ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 @@ -25578,9 +25574,9 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25597,25 +25593,18 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v4, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -25633,18 +25622,25 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 
+; VI-NEXT: v_or_b32_sdwa v4, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: v_or_b32_sdwa v5, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: v_or_b32_sdwa v6, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_or_b32_sdwa v7, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 @@ -25656,14 +25652,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v7, 0x300 -; VI-NEXT: v_add_u16_e32 v2, 3, v33 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 ; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 @@ -25716,8 +25712,8 @@ define <4 x double> 
@bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v1 @@ -25736,9 +25732,9 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25755,25 +25751,18 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v7, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -25791,18 +25780,25 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: v_or_b32_sdwa v4, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: v_or_b32_sdwa v7, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 @@ -26352,6 +26348,13 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v18 @@ -26365,21 +26368,14 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 @@ -26540,6 +26536,9 @@ 
define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -26548,12 +26547,9 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -26696,6 +26692,9 @@ define 
inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -26704,12 +26703,9 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, 
s17, 8 @@ -27045,22 +27041,19 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v31, v15 -; SI-NEXT: v_mov_b32_e32 v30, v14 -; SI-NEXT: v_mov_b32_e32 v29, v13 -; SI-NEXT: v_mov_b32_e32 v28, v12 -; SI-NEXT: v_mov_b32_e32 v27, v11 -; SI-NEXT: v_mov_b32_e32 v26, v10 -; SI-NEXT: v_mov_b32_e32 v25, v9 -; SI-NEXT: v_mov_b32_e32 v24, v8 -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v5 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_mov_b32_e32 v31, v12 +; SI-NEXT: v_mov_b32_e32 v30, v11 +; SI-NEXT: v_mov_b32_e32 v29, v10 +; SI-NEXT: v_mov_b32_e32 v28, v9 +; SI-NEXT: v_mov_b32_e32 v27, v8 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -27075,38 +27068,26 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB88_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB88_4 -; SI-NEXT: .LBB88_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB88_3: ; %cmp.false -; 
SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 @@ -27120,25 +27101,32 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_2 -; SI-NEXT: .LBB88_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -27152,10 +27140,14 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: 
v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v13, v16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_mov_b32_e32 v15, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v16f16: @@ -27784,42 +27776,42 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; VI-NEXT: s_lshr_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v6, s5 -; VI-NEXT: s_lshr_b32 s5, s23, 16 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_add_f16_e32 v5, s22, v0 -; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v13, s5 -; VI-NEXT: v_add_f16_e32 v7, s23, v0 -; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v5, v6 ; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v7, v7, v13 -; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_or_b32_e32 v7, v7, v14 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_e32 v15, s22, v0 ; VI-NEXT: v_add_f16_e32 v8, s16, v0 -; VI-NEXT: v_add_f16_sdwa v9, v1, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s17, v0 -; VI-NEXT: v_add_f16_sdwa v10, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 -; VI-NEXT: v_add_f16_sdwa v11, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s19, v0 -; VI-NEXT: v_add_f16_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, s20, v0 -; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s21, v0 -; VI-NEXT: v_or_b32_e32 v5, v0, v5 -; VI-NEXT: v_or_b32_e32 v4, v4, v13 -; VI-NEXT: v_or_b32_e32 v3, v3, v12 -; VI-NEXT: v_or_b32_e32 v2, v2, v11 -; VI-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NEXT: v_or_b32_e32 v0, v8, v9 +; VI-NEXT: v_add_f16_e32 v9, s17, v0 +; VI-NEXT: v_add_f16_e32 v10, s18, v0 +; VI-NEXT: v_add_f16_e32 v11, s19, v0 +; VI-NEXT: v_add_f16_e32 v12, s20, v0 +; VI-NEXT: v_add_f16_e32 v13, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v15, v6 +; VI-NEXT: v_add_f16_sdwa v15, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v3, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v4, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v13, v0 +; VI-NEXT: v_or_b32_e32 v4, v12, v4 +; VI-NEXT: v_or_b32_e32 v3, v11, v3 +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_or_b32_e32 v1, v9, v1 +; VI-NEXT: v_or_b32_e32 v0, v8, v15 ; VI-NEXT: 
s_setpc_b64 s[30:31] ; VI-NEXT: .LBB91_3: ; VI-NEXT: s_branch .LBB91_2 @@ -28377,23 +28369,36 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; 
SI-NEXT: v_mul_f32_e32 v17, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -28422,36 +28427,36 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: .LBB94_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; 
SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: .LBB94_4: ; %cmp.true @@ -28466,21 +28471,21 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v25 ; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 @@ -28492,16 +28497,16 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 ; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -29263,150 +29268,150 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_f32_e32 v7, s4, v0 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_add_f32_e32 v14, s4, v0 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; 
VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_bfe_u32 v12, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v6 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v7, v14, v7, 16 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_add_f32_e32 v14, s4, v0 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_bfe_u32 v6, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v14 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_bfe_u32 v10, v2, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v3, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v10, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v11, v5 +; VI-NEXT: v_add_f32_e32 v14, s4, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 
0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_bfe_u32 v13, v14, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_cndmask_b32_e32 v9, v4, v9, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v13, v14 +; VI-NEXT: v_alignbit_b32 v6, v6, v12, 16 +; VI-NEXT: v_add_f32_e32 v12, s4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; VI-NEXT: v_bfe_u32 v1, v12, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v12 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v12 +; VI-NEXT: v_bfe_u32 v11, v2, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc -; 
VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, v11, v2 +; VI-NEXT: v_add_f32_e32 v13, s4, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; 
VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, v14, v13 +; VI-NEXT: v_add_f32_e32 v12, s4, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 ; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_bfe_u32 v11, v12, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v12 +; VI-NEXT: v_add_f32_e32 v14, s4, v0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v12 +; VI-NEXT: v_bfe_u32 v13, v14, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v13, v14 +; VI-NEXT: v_add_f32_e32 v12, s4, v0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v14 ; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v15, s4, v0 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, 
v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_bfe_u32 v12, v15, 16, 1 +; VI-NEXT: v_add_f32_e32 v0, s6, v0 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v15 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v15, v15 +; VI-NEXT: v_bfe_u32 v15, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v15, v16, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_cndmask_b32_e64 v4, v12, v14, s[4:5] +; VI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; VI-NEXT: v_alignbit_b32 v4, v3, v11, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v10, 16 +; VI-NEXT: v_alignbit_b32 v2, v17, v1, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v9, 16 +; VI-NEXT: v_alignbit_b32 v0, v15, v8, 16 ; VI-NEXT: 
s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -29430,151 +29435,151 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; GFX9-NEXT: s_cbranch_execnz .LBB95_4 ; GFX9-NEXT: .LBB95_2: ; %cmp.true ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v0 +; GFX9-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v13, v13, v7 +; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; GFX9-NEXT: v_add_f32_e32 v13, s4, v0 +; GFX9-NEXT: v_bfe_u32 v14, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v13 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, 
v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_mov_b32_e32 v14, 0xffff0000 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX9-NEXT: v_and_or_b32 v7, v7, v14, v13 +; GFX9-NEXT: v_add_f32_e32 v13, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_bfe_u32 v6, v13, 16, 1 +; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v9, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_bfe_u32 v10, v4, 16, 1 ; GFX9-NEXT: s_lshl_b32 s4, s17, 16 ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_f32_e32 v12, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v3, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v10, v4 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v4 +; GFX9-NEXT: v_bfe_u32 v15, v12, 16, 1 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v4, 
vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v13, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v15, v12 +; GFX9-NEXT: v_and_or_b32 v6, v11, v14, v6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v12 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v9, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v1 +; GFX9-NEXT: v_bfe_u32 v4, v5, 16, 1 ; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v12, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v4, v5 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX9-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v9, v12 +; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v12 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: s_and_b32 s5, s22, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v12, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v5, v4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_bfe_u32 v5, v12, 16, 1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_add_f32_e32 v4, s5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v5, v12 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v12 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: s_lshl_b32 s5, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add_f32_e32 v5, s5, v1 -; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: s_and_b32 s5, s23, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_add_f32_e32 v6, s5, v1 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 -; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 -; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v6, v6 -; GFX9-NEXT: s_lshl_b32 s5, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc -; GFX9-NEXT: v_add_f32_e32 v7, s5, v1 -; GFX9-NEXT: v_bfe_u32 v13, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v13, v13, v7 -; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v12, v9 ; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_or_b32 v7, v6, v13, v7 -; GFX9-NEXT: v_and_or_b32 v6, v4, v13, v5 -; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX9-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v9 ; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v14, vcc -; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_bfe_u32 v14, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v5 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_or_b32 v5, v4, v13, v5 -; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 -; GFX9-NEXT: v_bfe_u32 v14, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v4 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; 
GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v15, vcc -; GFX9-NEXT: v_bfe_u32 v14, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v1 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v4, v4, v13, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GFX9-NEXT: v_and_or_b32 v3, v3, v13, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GFX9-NEXT: v_and_or_b32 v2, v2, v13, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_and_or_b32 v1, v9, v13, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v13, v8 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc +; GFX9-NEXT: v_bfe_u32 v12, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX9-NEXT: v_and_or_b32 v5, v9, v14, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_and_or_b32 v3, v1, v14, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX9-NEXT: v_and_or_b32 v2, v2, v14, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: v_and_or_b32 v1, v10, v14, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX9-NEXT: v_and_or_b32 v4, v4, v14, v12 +; GFX9-NEXT: v_and_or_b32 v0, v8, v14, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB95_3: ; GFX9-NEXT: s_branch 
.LBB95_2 @@ -29971,20 +29976,26 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v16i16_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v37, v3 +; VI-NEXT: v_mov_b32_e32 v36, v2 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr35 @@ -29992,7 +30003,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr24 @@ -30000,7 +30011,6 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: 
$vgpr29 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr27 @@ -30011,112 +30021,112 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[33:34] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v16, v4 -; VI-NEXT: v_mov_b32_e32 v49, v5 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v35, v37 +; VI-NEXT: v_mov_b32_e32 v16, v33 +; VI-NEXT: v_mov_b32_e32 v48, v34 ; VI-NEXT: v_mov_b32_e32 v24, v6 ; VI-NEXT: v_mov_b32_e32 v51, v7 ; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: 
; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v22, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v18, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v30, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v26, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v48, 3, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; VI-NEXT: v_add_u16_e32 v50, 3, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; VI-NEXT: v_add_u16_e32 v35, 3, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; VI-NEXT: v_add_u16_e32 v8, 3, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; VI-NEXT: v_add_u16_e32 v49, 3, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; VI-NEXT: v_add_u16_e32 v16, 3, v4 +; VI-NEXT: v_mov_b32_e32 v3, 3 +; VI-NEXT: v_add_u16_sdwa v14, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v35, 3, v37 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_add_u16_sdwa v10, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v35, v4 +; VI-NEXT: v_add_u16_e32 v8, 3, v36 +; VI-NEXT: 
v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_add_u16_sdwa v22, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v8, v4 +; VI-NEXT: v_add_u16_e32 v48, 3, v34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; VI-NEXT: v_add_u16_sdwa v18, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v32, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v48, v4 +; VI-NEXT: v_add_u16_e32 v16, 3, v33 ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; VI-NEXT: v_add_u16_sdwa v30, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v26, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v39, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_u16_e32 v50, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_or_b32_e32 v19, v16, v4 ; VI-NEXT: v_add_u16_e32 v51, 3, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 ; VI-NEXT: v_add_u16_e32 v24, 3, v6 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; VI-NEXT: v_or_b32_e32 v1, v48, v1 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; VI-NEXT: v_or_b32_e32 v1, v39, v1 ; VI-NEXT: v_or_b32_e32 v0, v50, v0 -; VI-NEXT: v_or_b32_e32 v3, v35, v3 -; VI-NEXT: v_or_b32_e32 v2, v8, v2 -; VI-NEXT: v_or_b32_e32 v5, v49, v5 -; VI-NEXT: v_or_b32_e32 v4, v16, v4 -; VI-NEXT: v_or_b32_e32 v7, v51, v7 -; VI-NEXT: v_or_b32_e32 v6, v24, v6 +; VI-NEXT: v_or_b32_e32 v7, v51, v4 +; VI-NEXT: v_or_b32_e32 v6, v24, v3 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] 
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[19:20] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; VI-NEXT: v_bfe_u32 v39, v36, 8, 8 +; VI-NEXT: v_bfe_u32 v38, v32, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v0 ; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v50 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v2, v32 -; VI-NEXT: v_mov_b32_e32 v3, v33 -; VI-NEXT: v_mov_b32_e32 v4, v48 -; VI-NEXT: v_mov_b32_e32 v5, v37 -; VI-NEXT: v_mov_b32_e32 v6, v36 -; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v1, v49 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v6, v32 +; VI-NEXT: v_mov_b32_e32 v7, v38 ; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v20, v49 +; VI-NEXT: v_mov_b32_e32 v20, v48 ; VI-NEXT: v_mov_b32_e32 v28, v51 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16i16_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 ; GFX9-NEXT: v_mov_b32_e32 v35, v3 ; GFX9-NEXT: v_mov_b32_e32 v34, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; 
implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr13 @@ -30130,39 +30140,38 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, 
v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -30172,46 +30181,44 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v35, v35, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v34, v34, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v37, v37, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v36, v36, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 ; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v16, v36 +; GFX9-NEXT: v_mov_b32_e32 v20, v37 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v16i16_to_v32i8: @@ -30482,17 +30489,17 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 
v27, v28, s8, 24 ; SI-NEXT: v_alignbit_b32 v26, v28, s8, 16 ; SI-NEXT: v_alignbit_b32 v25, v28, s8, 8 -; SI-NEXT: s_lshr_b32 s44, s12, 8 -; SI-NEXT: s_lshr_b32 s14, s9, 8 -; SI-NEXT: s_lshr_b32 s41, s10, 8 +; SI-NEXT: s_lshr_b32 s43, s12, 8 +; SI-NEXT: s_lshr_b32 s13, s9, 8 +; SI-NEXT: s_lshr_b32 s40, s10, 8 ; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; SI-NEXT: s_and_b32 s45, s19, 0xffff -; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_and_b32 s42, s27, 0xffff +; SI-NEXT: s_and_b32 s44, s19, 0xffff +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_and_b32 s41, s27, 0xffff ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v4 -; SI-NEXT: s_bfe_u32 s13, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s40, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s43, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s45, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s15, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s42, s27, 0x80008 ; SI-NEXT: v_bfe_u32 v31, v4, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true @@ -30551,34 +30558,34 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v27, v28, v0, 24 ; SI-NEXT: v_alignbit_b32 v26, v28, v0, 16 ; SI-NEXT: v_alignbit_b32 v25, v28, v0, 8 -; SI-NEXT: s_lshr_b32 s13, s12, 24 -; SI-NEXT: s_lshr_b32 s45, s12, 16 -; SI-NEXT: s_lshr_b32 s44, s12, 8 -; SI-NEXT: s_lshr_b32 s40, s9, 24 -; SI-NEXT: s_lshr_b32 s15, s9, 16 -; SI-NEXT: s_lshr_b32 s14, s9, 8 -; SI-NEXT: s_lshr_b32 s43, s10, 24 -; SI-NEXT: s_lshr_b32 s42, s10, 16 -; SI-NEXT: s_lshr_b32 s41, s10, 8 +; SI-NEXT: s_lshr_b32 s45, s12, 24 +; SI-NEXT: s_lshr_b32 s44, s12, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 8 +; SI-NEXT: s_lshr_b32 s15, s9, 24 +; SI-NEXT: s_lshr_b32 s14, s9, 16 +; SI-NEXT: s_lshr_b32 s13, s9, 8 +; SI-NEXT: s_lshr_b32 s42, s10, 24 +; SI-NEXT: s_lshr_b32 s41, s10, 16 +; SI-NEXT: s_lshr_b32 s40, s10, 8 ; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 ; SI-NEXT: .LBB97_3: ; %end ; 
SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s44 -; SI-NEXT: v_mov_b32_e32 v6, s45 -; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s44 +; SI-NEXT: v_mov_b32_e32 v7, s45 ; SI-NEXT: v_mov_b32_e32 v8, s6 ; SI-NEXT: v_mov_b32_e32 v12, s9 -; SI-NEXT: v_mov_b32_e32 v13, s14 -; SI-NEXT: v_mov_b32_e32 v14, s15 -; SI-NEXT: v_mov_b32_e32 v15, s40 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 ; SI-NEXT: v_mov_b32_e32 v16, s7 ; SI-NEXT: v_mov_b32_e32 v20, s10 -; SI-NEXT: v_mov_b32_e32 v21, s41 -; SI-NEXT: v_mov_b32_e32 v22, s42 -; SI-NEXT: v_mov_b32_e32 v23, s43 +; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v22, s41 +; SI-NEXT: v_mov_b32_e32 v23, s42 ; SI-NEXT: v_mov_b32_e32 v24, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: @@ -30587,25 +30594,25 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 @@ 
-30826,12 +30833,12 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 @@ -30866,24 +30873,20 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; GFX9-NEXT: s_branch .LBB97_2 ; GFX9-NEXT: .LBB97_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: v_mov_b32_e32 v9, s19 ; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: v_mov_b32_e32 v24, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 ; GFX9-NEXT: v_mov_b32_e32 v35, s59 ; GFX9-NEXT: v_mov_b32_e32 v2, s57 ; GFX9-NEXT: v_mov_b32_e32 v5, s58 ; GFX9-NEXT: v_mov_b32_e32 v6, s56 ; GFX9-NEXT: v_mov_b32_e32 v7, s47 -; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v33, s46 ; GFX9-NEXT: v_mov_b32_e32 v10, s44 ; GFX9-NEXT: v_mov_b32_e32 v13, s45 ; GFX9-NEXT: v_mov_b32_e32 v14, s43 ; GFX9-NEXT: v_mov_b32_e32 v15, s42 -; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v34, s41 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v21, s40 ; GFX9-NEXT: v_mov_b32_e32 v22, s28 @@ -30892,8 +30895,12 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v26, s24 ; GFX9-NEXT: v_mov_b32_e32 v29, s25 ; GFX9-NEXT: v_mov_b32_e32 v30, s15 -; GFX9-NEXT: v_mov_b32_e32 v31, s14 ; GFX9-NEXT: 
v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 ; GFX9-NEXT: v_mov_b32_e32 v19, s8 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 @@ -30903,8 +30910,8 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v20, v17 ; GFX9-NEXT: v_mov_b32_e32 v28, v25 ; GFX9-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v17, v34 ; GFX9-NEXT: v_mov_b32_e32 v25, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -31053,125 +31060,121 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v32i8_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v34, v8 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 -; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v31, v11 +; SI-NEXT: v_mov_b32_e32 v39, v9 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v5 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v50 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v31 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v15 ; SI-NEXT: 
s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v41 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v9, 0xff, v14 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v38 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v37, v2 -; SI-NEXT: v_or_b32_e32 v2, v1, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_or_b32_e32 v4, v39, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v7, v48, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v5, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v21, v0, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v9, v9, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_or_b32_e32 v8, v50, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v11, v51, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v10, v9, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v29, v0, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_and_b32_e32 
v11, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v13, v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_or_b32_e32 v12, v53, v9 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v15, v27, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; SI-NEXT: v_or_b32_e32 v14, v13, v15 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v8, v0, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v18, v55, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v18, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v51, v52, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v38, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v15, v21, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v32, v32, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v7, v7, v53 +; 
SI-NEXT: v_or_b32_e32 v13, v23, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_or_b32_e32 v25, v35, v25 +; SI-NEXT: v_or_b32_e32 v35, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v41, v17, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_or_b32_e32 v36, v34, v39 +; SI-NEXT: v_or_b32_e32 v38, v1, v5 +; SI-NEXT: v_alignbit_b32 v1, v35, v5, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v31, v37, v29 +; SI-NEXT: v_or_b32_e32 v27, v33, v27 +; SI-NEXT: v_or_b32_e32 v33, v7, v15 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v36, v7, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v40, v17 +; SI-NEXT: v_or_b32_e32 v34, v5, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v31, v7, v41 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v25 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -31180,21 +31183,30 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; 
implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v37, v5, v9 +; SI-NEXT: v_alignbit_b32 v5, v34, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, v33, v13, 16 +; SI-NEXT: v_or_b32_e32 v32, v7, v42 +; SI-NEXT: v_alignbit_b32 v13, v31, v42, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: .LBB98_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -31202,121 +31214,132 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v55, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 ; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v27, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v1 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v1 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; 
SI-NEXT: v_add_i32_e32 v10, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_or_b32_e32 v0, v19, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v21, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v29, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v1, v35, v38, 16 +; SI-NEXT: v_alignbit_b32 v5, v34, v37, 16 +; SI-NEXT: v_alignbit_b32 v9, v33, v36, 16 +; SI-NEXT: v_alignbit_b32 v13, v31, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v21 -; SI-NEXT: v_mov_b32_e32 v4, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v38 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; 
SI-NEXT: v_mov_b32_e32 v4, v37 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v10, v33 +; SI-NEXT: v_mov_b32_e32 v12, v32 +; SI-NEXT: v_mov_b32_e32 v14, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v34, v4 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 @@ -31329,9 +31352,9 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -31343,34 +31366,27 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 
s[30:31] ; VI-NEXT: .LBB98_3: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v26, 
v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr12 @@ -31383,19 +31399,26 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: v_or_b32_sdwa v4, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: v_or_b32_sdwa v5, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: v_or_b32_sdwa v6, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: v_or_b32_sdwa v7, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -31433,21 +31456,21 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v34 -; VI-NEXT: v_or_b32_sdwa v10, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_or_b32_sdwa v10, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v2, v8, v2 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v12 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v1, 3, v31 ; VI-NEXT: v_or_b32_e32 v3, v8, v3 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v16 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; 
VI-NEXT: v_or_b32_e32 v4, v8, v4 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v20 ; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 @@ -31466,16 +31489,16 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v32, v2 -; GFX9-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: v_mov_b32_e32 v34, v6 -; GFX9-NEXT: v_mov_b32_e32 v31, v4 -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v9 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v15 @@ -31487,9 +31510,9 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -31501,35 +31524,28 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB98_3: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr8 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr12 @@ -31542,19 +31558,26 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: v_perm_b32 v2, v4, v3, s6 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: v_perm_b32 v3, v6, v5, s6 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: v_perm_b32 v4, v11, v7, s6 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: v_perm_b32 v5, v19, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: v_perm_b32 v6, v23, v25, s6 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: v_perm_b32 v7, v27, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 @@ -31592,21 +31615,21 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v10 ; 
GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v38, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -32011,127 +32034,125 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v19, v14 -; SI-NEXT: v_mov_b32_e32 v20, v12 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: 
v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v2 ; SI-NEXT: v_readfirstlane_b32 s7, v1 ; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v13 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s8, s23, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_or_b32 s11, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s8, s19, 24 -; SI-NEXT: s_or_b32 s4, s8, s4 -; SI-NEXT: s_and_b32 s8, s28, 0xff -; SI-NEXT: s_lshl_b32 s12, s29, 8 -; SI-NEXT: s_or_b32 s8, s8, s12 -; SI-NEXT: s_and_b32 s12, s6, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s27, 24 +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_and_b32 s12, s20, 0xff +; SI-NEXT: s_lshl_b32 s13, s21, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s23, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s40, s14, s13 +; SI-NEXT: s_or_b32 s14, s12, s40 +; SI-NEXT: s_and_b32 s12, s28, 0xff +; SI-NEXT: s_lshl_b32 s13, s29, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s6, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: 
s_or_b32 s41, s15, s12 -; SI-NEXT: s_and_b32 s12, s26, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s15, s27, 24 -; SI-NEXT: s_or_b32 s12, s15, s12 -; SI-NEXT: s_and_b32 s15, s16, 0xff -; SI-NEXT: s_lshl_b32 s40, s17, 8 +; SI-NEXT: s_or_b32 s41, s15, s13 +; SI-NEXT: s_and_b32 s13, s16, 0xff +; SI-NEXT: s_lshl_b32 s15, s17, 8 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 -; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: s_or_b32 s15, s13, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s13, s25, 8 ; SI-NEXT: v_or_b32_e32 v9, v9, v2 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: s_or_b32 s4, s4, s13 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v11, v0, v10 -; SI-NEXT: s_or_b32 s15, s15, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s40, s25, 8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_or_b32_e32 v10, v9, v11 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v15, v15, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v13, v3, v9 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v22, v7, v17 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: v_or_b32_e32 v12, v3, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 -; SI-NEXT: s_or_b32 s12, s4, s12 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s40, s9, 8 
-; SI-NEXT: v_or_b32_e32 v9, v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_or_b32 s4, s4, s40 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v15, v7, v13 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v14, v9, v15 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v18, s4, v12 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s40, s13, 8 -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_or_b32 s4, s4, s40 -; SI-NEXT: s_or_b32 s8, s8, s41 -; SI-NEXT: v_or_b32_e32 v22, v17, v9 +; SI-NEXT: v_or_b32_e32 v19, v15, v22 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v13 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_or_b32 s12, s12, s41 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, s8, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v22, 16 -; SI-NEXT: v_or_b32_e32 v12, s4, v22 -; SI-NEXT: s_lshr_b32 s40, s5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, s14, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, s12, v5, 16 +; SI-NEXT: v_alignbit_b32 v13, v19, v15, 16 +; SI-NEXT: v_or_b32_e32 v17, s4, v15 +; SI-NEXT: s_lshr_b32 s40, s40, 16 ; SI-NEXT: s_lshr_b32 s41, s41, 16 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; 
SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -32153,7 +32174,7 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s12, s4, 0x3000000 +; SI-NEXT: s_add_i32 s13, s4, 0x3000000 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 @@ -32166,7 +32187,7 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: s_add_i32 s12, s4, 0x3000000 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; 
SI-NEXT: s_add_i32 s18, s18, 3 @@ -32203,42 +32224,44 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: s_add_i32 s14, s4, 0x3000000 ; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v5, s8, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s14, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_alignbit_b32 v5, s12, v0, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v18, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: s_lshr_b32 s40, s11, 16 -; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: v_alignbit_b32 v13, v19, v17, 16 +; SI-NEXT: s_lshr_b32 s40, s14, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 ; SI-NEXT: .LBB99_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s40 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: v_mov_b32_e32 v7, s41 ; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: v_mov_b32_e32 v14, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: 
; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_branch .LBB99_2 ; @@ -32265,6 +32288,9 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -32273,12 +32299,9 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -32425,15 +32448,19 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff @@ -32451,16 +32478,12 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 
s7, s7, s8 -; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -32744,23 +32767,36 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 
v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -32787,69 +32823,69 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB100_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 
-; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v31 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_2 ; SI-NEXT: .LBB100_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v8, 
0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -33126,42 +33162,42 @@ define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg % ; VI-NEXT: s_lshr_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v6, s5 -; VI-NEXT: s_lshr_b32 s5, s23, 16 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_add_f16_e32 v5, s22, v0 -; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v13, s5 -; VI-NEXT: v_add_f16_e32 v7, s23, v0 -; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v5, v6 ; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; 
VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v7, v7, v13 -; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_or_b32_e32 v7, v7, v14 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_e32 v15, s22, v0 ; VI-NEXT: v_add_f16_e32 v8, s16, v0 -; VI-NEXT: v_add_f16_sdwa v9, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s17, v0 -; VI-NEXT: v_add_f16_sdwa v10, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 -; VI-NEXT: v_add_f16_sdwa v11, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s19, v0 -; VI-NEXT: v_add_f16_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, s20, v0 -; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s21, v0 -; VI-NEXT: v_or_b32_e32 v5, v0, v5 -; VI-NEXT: v_or_b32_e32 v4, v4, v13 -; VI-NEXT: v_or_b32_e32 v3, v3, v12 -; VI-NEXT: v_or_b32_e32 v2, v2, v11 -; VI-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NEXT: v_or_b32_e32 v0, v8, v9 +; VI-NEXT: v_add_f16_e32 v9, s17, v0 +; VI-NEXT: v_add_f16_e32 v10, s18, v0 +; VI-NEXT: v_add_f16_e32 v11, s19, v0 +; VI-NEXT: v_add_f16_e32 v12, s20, v0 +; VI-NEXT: v_add_f16_e32 v13, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v15, v6 +; VI-NEXT: v_add_f16_sdwa v15, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v3, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v4, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v13, v0 +; VI-NEXT: v_or_b32_e32 v4, v12, v4 +; VI-NEXT: v_or_b32_e32 v3, v11, v3 +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_or_b32_e32 v1, v9, v1 +; VI-NEXT: v_or_b32_e32 v0, v8, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB101_3: ; VI-NEXT: s_branch .LBB101_2 @@ -33259,23 +33295,36 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v12 -; 
SI-NEXT: v_mul_f32_e32 v29, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -33303,21 +33352,34 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB102_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; 
SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -33334,68 +33396,55 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: .LBB102_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 @@ -34184,150 +34233,150 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_f32_e32 v7, s4, v0 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_add_f32_e32 v14, s4, v0 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 
v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_bfe_u32 v12, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v6 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v7, v14, v7, 16 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_add_f32_e32 v14, s4, v0 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_bfe_u32 v6, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v14 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_bfe_u32 v10, v2, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v3, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v10, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v11, v5 +; VI-NEXT: v_add_f32_e32 v14, s4, v0 ; 
VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_bfe_u32 v13, v14, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_cndmask_b32_e32 v9, v4, v9, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v13, v14 +; VI-NEXT: v_alignbit_b32 v6, v6, v12, 16 +; VI-NEXT: v_add_f32_e32 v12, s4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; VI-NEXT: v_bfe_u32 v1, v12, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v12 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v12 +; VI-NEXT: v_bfe_u32 v11, v2, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: 
v_cndmask_b32_e32 v10, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, v11, v2 +; VI-NEXT: v_add_f32_e32 v13, s4, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; 
VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, v14, v13 +; VI-NEXT: v_add_f32_e32 v12, s4, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 ; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_bfe_u32 v11, v12, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v12 +; VI-NEXT: v_add_f32_e32 v14, s4, v0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v12 +; VI-NEXT: v_bfe_u32 v13, v14, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v13, v14 +; VI-NEXT: v_add_f32_e32 v12, s4, v0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v14 ; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v15, s4, v0 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 
0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_bfe_u32 v12, v15, 16, 1 +; VI-NEXT: v_add_f32_e32 v0, s6, v0 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v15 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v15, v15 +; VI-NEXT: v_bfe_u32 v15, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v15, v16, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_cndmask_b32_e64 v4, v12, v14, s[4:5] +; VI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; VI-NEXT: v_alignbit_b32 v4, v3, v11, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v10, 16 +; VI-NEXT: v_alignbit_b32 v2, v17, v1, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v9, 16 +; VI-NEXT: 
v_alignbit_b32 v0, v15, v8, 16 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -34351,159 +34400,159 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; GFX9-NEXT: s_cbranch_execnz .LBB103_4 ; GFX9-NEXT: .LBB103_2: ; %cmp.true ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v0 +; GFX9-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v13, v13, v7 +; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; GFX9-NEXT: v_add_f32_e32 v13, s4, v0 +; GFX9-NEXT: v_bfe_u32 v14, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v13 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, 
v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; GFX9-NEXT: v_mov_b32_e32 v14, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_and_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v13, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_bfe_u32 v6, v13, 16, 1 +; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v8, v9, v3 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_bfe_u32 v10, v4, 16, 1 ; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v12, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v10, v4 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v4 +; GFX9-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX9-NEXT: 
v_add_f32_e32 v1, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v6, vcc -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v6, v7, vcc -; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc -; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: s_and_b32 s5, s22, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_add_f32_e32 v5, s5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v6, v7, vcc -; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: s_lshl_b32 s5, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_add_f32_e32 v6, s5, v1 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 -; GFX9-NEXT: 
v_add_u32_e32 v7, 0x7fff, v7 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: s_and_b32 s5, s23, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; GFX9-NEXT: v_add_f32_e32 v7, s5, v1 -; GFX9-NEXT: v_bfe_u32 v12, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v7 -; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 -; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: s_lshl_b32 s5, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; GFX9-NEXT: v_add_f32_e32 v12, s5, v1 -; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v13, v13, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_and_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v10, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc +; GFX9-NEXT: v_add_u32_e32 v13, v15, v12 +; GFX9-NEXT: v_lshl_or_b32 v6, v11, 16, v6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v12 +; GFX9-NEXT: v_bfe_u32 v4, v10, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_b32_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_and_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v6, v5, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v12 -; GFX9-NEXT: v_bfe_u32 v12, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v5 -; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 -; GFX9-NEXT: 
v_or_b32_e32 v14, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v14, vcc -; GFX9-NEXT: v_add_f32_e32 v12, s4, v1 -; GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v12 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_add_f32_e32 v12, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v10 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v4, v15, vcc +; GFX9-NEXT: v_add_u32_e32 v4, v13, v12 +; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v12 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v12, s4, v1 -; GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v12 +; GFX9-NEXT: v_add_f32_e32 v12, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v4, v10, v9 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_bfe_u32 v10, v12, 16, 1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: 
s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v4, v13, vcc +; GFX9-NEXT: v_add_u32_e32 v4, v10, v12 +; GFX9-NEXT: v_add_f32_e32 v10, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v12 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GFX9-NEXT: v_bfe_u32 v14, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v1 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GFX9-NEXT: v_and_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v10 -; GFX9-NEXT: v_and_b32_sdwa v10, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_add_u32_e32 v9, v12, v10 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_add_f32_e32 v10, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; GFX9-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v10 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v16, vcc +; GFX9-NEXT: v_bfe_u32 v12, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v16, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; GFX9-NEXT: v_and_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v14, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v4, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v16, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v14, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB103_3: ; GFX9-NEXT: s_branch .LBB103_2 @@ -34725,25 
+34774,40 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v32i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v14 -; SI-NEXT: v_mov_b32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v17 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v30, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v37, v9 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, 
v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v32, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -34782,22 +34846,22 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB104_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_or_b32_e32 v8, v36, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v48, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_or_b32_e32 v12, v35, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v16, v39, v5 +; SI-NEXT: v_or_b32_e32 v12, v39, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v16, v37, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_or_b32_e32 v20, v38, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_or_b32_e32 v20, v36, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v24, v50, v5 +; SI-NEXT: v_or_b32_e32 v24, v34, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v4, v32, v1 -; SI-NEXT: v_or_b32_e32 v28, v49, v5 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v4, v50, v1 +; SI-NEXT: v_or_b32_e32 v28, v33, v5 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -34818,25 +34882,25 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; 
SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: .LBB104_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -34844,9 +34908,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_or_b32_e32 v24, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -34858,33 +34922,33 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v16, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v20, v0, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_or_b32_e32 v8, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_or_b32_e32 v12, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -34927,7 +34991,6 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v34, v4 ; VI-NEXT: v_mov_b32_e32 v33, v3 ; VI-NEXT: v_mov_b32_e32 v32, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 @@ -34936,7 +34999,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: 
$vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr9 @@ -34948,7 +35013,6 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr27 @@ -34976,43 +35040,43 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v5, 0x200 -; VI-NEXT: v_add_f16_sdwa v14, v33, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v36, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; VI-NEXT: v_add_f16_e32 v33, 0x200, v33 -; VI-NEXT: v_add_f16_sdwa v10, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, 0x200 +; VI-NEXT: v_add_f16_sdwa v36, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v14, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_or_b32_e32 v12, v33, v8 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 -; VI-NEXT: v_add_f16_sdwa v22, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v33 +; VI-NEXT: v_add_f16_sdwa v22, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_add_f16_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_e32 v11, v32, v8 +; VI-NEXT: v_add_f16_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v33, v8 +; VI-NEXT: v_add_f16_sdwa v10, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 ; VI-NEXT: v_add_f16_e32 v35, 0x200, v35 -; VI-NEXT: v_add_f16_sdwa v18, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v30, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v26, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v18, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v30, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v26, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; VI-NEXT: v_or_b32_e32 v9, v35, v8 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 -; VI-NEXT: v_add_f16_e32 v34, 0x200, v34 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v3, v0, v3 -; VI-NEXT: v_or_b32_e32 v8, v34, v8 +; VI-NEXT: v_add_f16_e32 v34, 0x200, v34 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v16, v7, v13 -; VI-NEXT: v_or_b32_e32 v15, v6, v5 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] +; VI-NEXT: v_or_b32_e32 v15, v6, v3 +; VI-NEXT: v_or_b32_e32 v8, v34, v8 +; VI-NEXT: v_or_b32_e32 v3, v0, v5 +; VI-NEXT: v_or_b32_e32 v11, v32, v11 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; 
VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[8:9] ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[8:9] ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 @@ -35042,16 +35106,19 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16f16_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 ; GFX9-NEXT: v_mov_b32_e32 v35, v3 ; GFX9-NEXT: v_mov_b32_e32 v34, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr13 @@ -35065,39 +35132,38 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -35108,46 +35174,44 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v35, v35, s6 op_sel_hi:[1,0] ; GFX9-NEXT: 
v_pk_add_f16 v34, v34, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v37, v37, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v36, v36, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v33, v33, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 ; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v16, v36 +; GFX9-NEXT: v_mov_b32_e32 v20, v37 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v16f16_to_v32i8: @@ -35589,51 +35653,51 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s4, s19, 16 ; VI-NEXT: v_add_f16_e32 v14, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; VI-NEXT: v_add_f16_e32 v34, s19, v1 ; VI-NEXT: v_add_f16_e32 v10, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v12, v34, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; VI-NEXT: v_add_f16_e32 v8, s18, v1 ; VI-NEXT: v_add_f16_e32 v22, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v11, v8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; VI-NEXT: v_add_f16_e32 v33, s21, v1 ; VI-NEXT: v_add_f16_e32 v18, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v20, v33, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; VI-NEXT: v_add_f16_e32 v16, s20, v1 ; VI-NEXT: v_add_f16_e32 v30, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; VI-NEXT: v_add_f16_e32 v35, s17, v1 -; VI-NEXT: v_or_b32_e32 v19, v16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 ; VI-NEXT: v_add_f16_e32 v32, s23, v1 ; 
VI-NEXT: v_add_f16_e32 v26, s4, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v35, s17, v1 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; VI-NEXT: v_add_f16_e32 v34, s19, v1 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; VI-NEXT: v_add_f16_e32 v33, s21, v1 +; VI-NEXT: v_or_b32_e32 v28, v32, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; VI-NEXT: v_add_f16_e32 v24, s22, v1 ; VI-NEXT: v_or_b32_e32 v4, v35, v0 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; VI-NEXT: v_add_f16_e32 v0, s16, v1 -; VI-NEXT: v_or_b32_e32 v37, v32, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; VI-NEXT: v_add_f16_e32 v24, s22, v1 +; VI-NEXT: v_or_b32_e32 v12, v34, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; VI-NEXT: v_or_b32_e32 v20, v33, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; VI-NEXT: v_add_f16_e32 v16, s20, v1 +; VI-NEXT: v_or_b32_e32 v27, v24, v8 +; VI-NEXT: v_add_f16_e32 v8, s18, v1 +; VI-NEXT: v_or_b32_e32 v19, v16, v7 ; VI-NEXT: v_or_b32_e32 v3, v0, v3 -; VI-NEXT: v_or_b32_e32 v36, v24, v5 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[36:37] +; VI-NEXT: v_or_b32_e32 v11, v8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[19:20] ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[27:28] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[19:20] ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v37 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v36 ; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 @@ -35693,8 +35757,8 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v9, s27 ; 
VI-NEXT: v_mov_b32_e32 v13, s15 ; VI-NEXT: v_mov_b32_e32 v1, s26 -; VI-NEXT: v_mov_b32_e32 v5, s14 ; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v5, s14 ; VI-NEXT: v_mov_b32_e32 v19, s8 ; VI-NEXT: v_mov_b32_e32 v11, s6 ; VI-NEXT: v_mov_b32_e32 v3, s4 @@ -35759,12 +35823,12 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 @@ -35799,24 +35863,20 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; GFX9-NEXT: s_branch .LBB105_2 ; GFX9-NEXT: .LBB105_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: v_mov_b32_e32 v9, s19 ; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: v_mov_b32_e32 v24, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 ; GFX9-NEXT: v_mov_b32_e32 v35, s59 ; GFX9-NEXT: v_mov_b32_e32 v2, s57 ; GFX9-NEXT: v_mov_b32_e32 v5, s58 ; GFX9-NEXT: v_mov_b32_e32 v6, s56 ; GFX9-NEXT: v_mov_b32_e32 v7, s47 -; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v33, s46 ; GFX9-NEXT: v_mov_b32_e32 v10, s44 ; GFX9-NEXT: v_mov_b32_e32 v13, s45 ; GFX9-NEXT: v_mov_b32_e32 v14, s43 ; GFX9-NEXT: v_mov_b32_e32 v15, s42 -; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v34, s41 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v21, s40 ; GFX9-NEXT: v_mov_b32_e32 v22, s28 
@@ -35825,8 +35885,12 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v26, s24 ; GFX9-NEXT: v_mov_b32_e32 v29, s25 ; GFX9-NEXT: v_mov_b32_e32 v30, s15 -; GFX9-NEXT: v_mov_b32_e32 v31, s14 ; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 ; GFX9-NEXT: v_mov_b32_e32 v19, s8 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 @@ -35836,8 +35900,8 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v20, v17 ; GFX9-NEXT: v_mov_b32_e32 v28, v25 ; GFX9-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v17, v34 ; GFX9-NEXT: v_mov_b32_e32 v25, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -35986,103 +36050,95 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v32i8_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v2 -; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v34, v4 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v17 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v9 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v5 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v50 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_lshlrev_b32_e32 v55, 8, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v55 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v8, v8, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v8, v8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v8, v8, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v8, v8, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v36 -; SI-NEXT: v_or_b32_e32 v2, v2, v37 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v39 -; SI-NEXT: v_or_b32_e32 v5, v5, v48 -; SI-NEXT: v_or_b32_e32 v6, v6, v49 -; SI-NEXT: v_or_b32_e32 v7, v7, v50 -; SI-NEXT: v_or_b32_e32 v8, v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v51 +; SI-NEXT: v_or_b32_e32 v3, v3, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v38 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: v_or_b32_e32 v15, v15, v53 +; SI-NEXT: v_or_b32_e32 v29, v29, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v40, v31, v19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v41, v32, v21 +; SI-NEXT: v_or_b32_e32 v33, v33, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v34, v25 +; SI-NEXT: v_or_b32_e32 v27, v35, v27 +; SI-NEXT: v_or_b32_e32 v54, v36, v54 +; SI-NEXT: v_or_b32_e32 v55, v37, v55 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 @@ -36095,19 +36151,35 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> 
%a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -36115,108 +36187,117 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; 
SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, 
v21, v1 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v0, 
0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, v17 -; SI-NEXT: v_mov_b32_e32 v10, v21 -; SI-NEXT: v_mov_b32_e32 v12, v25 +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v31 +; SI-NEXT: v_mov_b32_e32 v2, v34 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v6, v32 +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_mov_b32_e32 v10, v35 +; SI-NEXT: v_mov_b32_e32 v12, v33 ; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v34, v4 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 @@ -36229,9 +36310,9 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -36243,34 +36324,27 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB106_3: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v19 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v19, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr12 @@ -36283,19 +36357,26 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: v_or_b32_sdwa v4, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: v_or_b32_sdwa v5, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: v_or_b32_sdwa v6, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: v_or_b32_sdwa v7, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -36333,21 +36414,21 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v34 -; VI-NEXT: v_or_b32_sdwa v10, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_or_b32_sdwa v10, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v2, v8, v2 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v12 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v1, 3, v31 ; VI-NEXT: v_or_b32_e32 v3, v8, v3 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v16 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v4, v8, v4 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v20 ; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 @@ -36366,16 +36447,16 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v32, v2 -; GFX9-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: v_mov_b32_e32 v34, v6 -; GFX9-NEXT: v_mov_b32_e32 v31, v4 -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v9 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v15 @@ -36387,9 +36468,9 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_lshlrev_b16_e32 v27, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -36401,35 +36482,28 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB106_3: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: 
v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr8 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr12 @@ -36442,19 +36516,26 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: v_perm_b32 v2, v4, v3, s6 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: v_perm_b32 v3, v6, v5, s6 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: v_perm_b32 v4, v11, v7, s6 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: v_perm_b32 v5, v19, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: v_perm_b32 v6, v23, v25, s6 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: v_perm_b32 v7, v27, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 @@ -36492,21 +36573,21 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u16_e32 v11, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v10 ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v38, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -37138,6 +37219,9 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v22, v23 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -37146,12 +37230,9 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -37298,15 +37379,19 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; 
GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff @@ -37324,16 +37409,12 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; 
GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -37617,23 +37698,40 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v32i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_mov_b32_e32 v30, v15 +; SI-NEXT: v_mov_b32_e32 v31, v14 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 
1.0, v37 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v31 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -37676,22 +37774,22 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB108_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 -; SI-NEXT: v_alignbit_b32 v8, v5, v48, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 -; SI-NEXT: v_alignbit_b32 v16, v5, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_alignbit_b32 v8, v5, v49, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_alignbit_b32 v16, v5, v37, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v34 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 -; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v34, 16 -; SI-NEXT: v_alignbit_b32 v12, v14, v38, 16 -; SI-NEXT: v_alignbit_b32 v20, v22, v50, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_alignbit_b32 v0, v0, v53, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v51, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v39, 16 +; SI-NEXT: v_alignbit_b32 v20, v22, v35, 16 ; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16 -; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16 +; SI-NEXT: v_alignbit_b32 v28, v30, v33, 16 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -37704,30 +37802,30 @@ 
define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 ; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 ; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v50 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v38 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v34 ; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49 ; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v32 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: .LBB108_4: ; %cmp.true @@ -37737,44 +37835,44 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 
v24, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 ; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 ; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 ; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 ; SI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 @@ -37805,16 +37903,19 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; VI-LABEL: bitcast_v16bf16_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 ; VI-NEXT: v_mov_b32_e32 v35, v3 ; VI-NEXT: v_mov_b32_e32 v34, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr13 @@ -37828,240 +37929,240 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz 
.LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: 
s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v35, v12, v11, 16 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v37 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 
v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v36, v12, v11, 16 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 
16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_bfe_u32 v9, v2, 16, 1 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v2 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v2 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v8, v1 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_bfe_u32 v10, v6, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; 
VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v10, v6 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; VI-NEXT: v_bfe_u32 v16, v0, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v16, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v35, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: 
v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v34, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v33, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 
0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v32, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_alignbit_b32 v32, v12, v11, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; VI-NEXT: v_lshrrev_b64 
v[11:12], 24, v[34:35] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 ; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v16, v36 +; VI-NEXT: v_mov_b32_e32 v20, v37 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v1, v28 +; 
VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16bf16_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 ; GFX9-NEXT: v_mov_b32_e32 v35, v3 ; GFX9-NEXT: v_mov_b32_e32 v34, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr13 @@ -38075,210 +38176,207 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[36:37] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; GFX9-NEXT: .LBB108_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 
v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 +; GFX9-NEXT: s_movk_i32 s9, 0x7fff +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s9 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s9 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: s_mov_b32 s8, 0x7060302 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: v_perm_b32 v3, v0, v5, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; GFX9-NEXT: v_perm_b32 v3, v0, v2, s8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v37 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v6, v6, v0, s9 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v10, 
0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v10, v11, vcc -; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; GFX9-NEXT: v_perm_b32 v11, v9, v13, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v33 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v10, v14, vcc -; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v10, v15, vcc -; GFX9-NEXT: v_bfe_u32 v15, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v32 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 
v9, v15, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s9 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v8, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v33 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v16, v19, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v7 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; GFX9-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_add3_u32 v10, v10, v7, s9 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GFX9-NEXT: v_add3_u32 v7, v15, v13, s9 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v7, v14, vcc +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v34 +; GFX9-NEXT: v_bfe_u32 v13, v7, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; 
GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; GFX9-NEXT: v_add3_u32 v13, v13, v7, s9 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v7 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v16, vcc +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_add3_u32 v13, v17, v15, s9 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v16, v20, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc -; GFX9-NEXT: v_perm_b32 v4, v8, v2, s7 -; GFX9-NEXT: v_perm_b32 v12, v1, v0, s7 -; GFX9-NEXT: v_perm_b32 v10, v17, v14, s7 -; GFX9-NEXT: v_perm_b32 v9, v9, v18, s7 -; GFX9-NEXT: v_perm_b32 v16, v7, v19, s7 -; GFX9-NEXT: v_perm_b32 v15, v6, v20, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v15 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v9 -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v35 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: 
v_cndmask_b32_e32 v15, v13, v16, vcc +; GFX9-NEXT: v_add3_u32 v11, v11, v9, s9 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v18, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; GFX9-NEXT: v_perm_b32 v11, v15, v7, s8 +; GFX9-NEXT: v_add3_u32 v15, v17, v1, s9 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v32 +; GFX9-NEXT: v_bfe_u32 v16, v13, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v20, vcc +; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s9 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v36 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v21 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v5, v19, vcc +; GFX9-NEXT: v_add3_u32 v16, v16, v13, s9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_add3_u32 v13, v22, v21, s9 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v21, v21 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v25 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v24, s[4:5] +; GFX9-NEXT: v_add3_u32 v19, v26, v25, s9 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v25, v25 +; GFX9-NEXT: v_perm_b32 v6, v8, v0, s8 +; GFX9-NEXT: v_bfe_u32 v23, v5, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v19, v4, s[4:5] +; GFX9-NEXT: v_add3_u32 v4, v18, v17, s9 +; GFX9-NEXT: v_cmp_u_f32_e64 
s[4:5], v17, v17 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v5 +; GFX9-NEXT: v_perm_b32 v19, v0, v13, s8 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v22, s[4:5] +; GFX9-NEXT: v_add3_u32 v4, v23, v5, s9 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX9-NEXT: v_perm_b32 v20, v14, v10, s8 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v21, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v16, v16, v15, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX9-NEXT: v_perm_b32 v5, v4, v0, s8 +; GFX9-NEXT: v_perm_b32 v12, v16, v12, s8 +; GFX9-NEXT: v_perm_b32 v4, v38, v9, s8 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v3 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v38 ; GFX9-NEXT: .LBB108_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 ; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v16, v36 +; GFX9-NEXT: v_mov_b32_e32 v20, v37 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v32i8: @@ -39182,13 +39280,9 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 ; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 ; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 ; VI-NEXT: v_mov_b32_e32 v35, s59 ; VI-NEXT: v_mov_b32_e32 v2, s57 ; VI-NEXT: v_mov_b32_e32 v5, s58 @@ -39208,8 +39302,12 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_mov_b32_e32 v26, s24 ; VI-NEXT: v_mov_b32_e32 v29, s25 ; VI-NEXT: v_mov_b32_e32 v30, s15 -; VI-NEXT: v_mov_b32_e32 v31, s14 ; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v31, s14 ; VI-NEXT: v_mov_b32_e32 v19, s8 ; VI-NEXT: v_mov_b32_e32 v11, s6 ; VI-NEXT: v_mov_b32_e32 v3, s4 @@ -39245,9 +39343,9 @@ define inreg <32 x i8> 
@bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; GFX9-NEXT: s_lshr_b32 s27, s19, 8 ; GFX9-NEXT: s_lshr_b32 s40, s18, 16 ; GFX9-NEXT: s_lshr_b32 s29, s18, 8 -; GFX9-NEXT: s_lshr_b32 s14, s17, 24 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 ; GFX9-NEXT: s_lshr_b32 s56, s17, 16 -; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s17, 8 ; GFX9-NEXT: s_lshr_b32 s26, s16, 16 ; GFX9-NEXT: s_lshr_b32 s25, s16, 8 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 @@ -39257,194 +39355,194 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; GFX9-NEXT: s_cbranch_execnz .LBB109_4 ; GFX9-NEXT: .LBB109_2: ; %cmp.true ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v6 ; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v6 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v6, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; 
GFX9-NEXT: v_add_f32_e32 v1, s4, v6 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v6 ; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v6 ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v1 -; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v10, s4, v6 ; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 -; 
GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 -; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v6 ; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 -; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v10 +; GFX9-NEXT: v_add_f32_e32 v13, s4, v6 ; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 -; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_add_f32_e32 v20, s4, v6 ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v22, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 -; GFX9-NEXT: 
v_add_u32_e32 v9, 0x7fff, v9 -; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; GFX9-NEXT: v_add_f32_e32 v11, s4, v6 ; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc -; GFX9-NEXT: v_add_f32_e32 v9, s4, v5 -; GFX9-NEXT: v_bfe_u32 v11, v9, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v11, v11, v9 -; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v6 ; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v7, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 -; GFX9-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v11, v11, v7 -; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX9-NEXT: v_bfe_u32 v21, v18, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v16, s4, v6 ; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 -; GFX9-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v11, v11, v7 -; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_add_f32_e32 v24, s4, v6 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v18 +; GFX9-NEXT: v_bfe_u32 v22, v20, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; GFX9-NEXT: v_bfe_u32 v27, v24, 16, 1 ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GFX9-NEXT: v_add_f32_e32 v11, s4, v5 -; GFX9-NEXT: v_bfe_u32 v12, v11, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v11 +; GFX9-NEXT: 
v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_bfe_u32 v14, v13, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; GFX9-NEXT: v_add_f32_e32 v29, s4, v6 ; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 -; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v21, v23, vcc +; GFX9-NEXT: v_add_u32_e32 v22, v22, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_add_u32_e32 v20, v27, v24 +; GFX9-NEXT: v_bfe_u32 v19, v17, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v31, s4, v6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v24 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v24, v24 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v13 +; GFX9-NEXT: v_bfe_u32 v26, v16, 16, 1 +; GFX9-NEXT: v_bfe_u32 v23, v31, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e64 v18, v20, v18, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v20, v22, v25, vcc +; GFX9-NEXT: v_add_u32_e32 v19, v19, v17 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v13 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v13, v13 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v17 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v31 +; GFX9-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_u32_e32 v25, v26, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v22, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v16 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v31 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_cmp_u_f32_e64 s[6:7], v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v19, v28, vcc +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v14, v36, v29 +; 
GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v27, s[6:7] +; GFX9-NEXT: v_lshl_or_b32 v20, v22, 16, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v30, vcc +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v36, s4, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v31, vcc +; GFX9-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v27, v6, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v9 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v9 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v25, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_u32_e32 v6, v17, v11 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v15, vcc +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_add_f32_e32 v5, s4, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; GFX9-NEXT: v_bfe_u32 v12, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v5 -; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 -; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v5 -; GFX9-NEXT: 
v_lshl_or_b32 v12, v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v11 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v9 -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX9-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v19, v4, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v4, v39, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v36 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v12, vcc +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v35 +; GFX9-NEXT: 
v_and_b32_e32 v3, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v28, v30, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v5, v14, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v37, v6, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v36, v1, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v36 ; GFX9-NEXT: s_branch .LBB109_5 ; GFX9-NEXT: .LBB109_3: ; GFX9-NEXT: ; implicit-def: $sgpr25 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr4 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr15 ; GFX9-NEXT: ; implicit-def: $sgpr29 ; GFX9-NEXT: ; implicit-def: $sgpr40 ; GFX9-NEXT: ; implicit-def: $sgpr6 @@ -39491,11 +39589,11 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v13, s27 ; GFX9-NEXT: 
v_mov_b32_e32 v2, s26 ; GFX9-NEXT: v_mov_b32_e32 v1, s25 -; GFX9-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 ; GFX9-NEXT: v_mov_b32_e32 v27, s10 ; GFX9-NEXT: v_mov_b32_e32 v19, s8 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: .LBB109_5: ; %end ; GFX9-NEXT: v_mov_b32_e32 v4, v35 @@ -39800,99 +39898,99 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v32i8_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v35, v11 +; SI-NEXT: v_mov_b32_e32 v49, v7 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v51 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v50 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, 
v13 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v33 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 -; SI-NEXT: v_lshlrev_b32_e32 
v35, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: v_or_b32_e32 v31, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v5, v39, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v2, v2, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v7, v49, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v17 -; SI-NEXT: v_or_b32_e32 v23, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v27, v50, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v11, v21, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v25 -; SI-NEXT: v_or_b32_e32 v32, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v13, v52, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v2, v2, v55 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_or_b32_e32 v15, v54, v2 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: 
v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v17 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v11, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v37 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v34, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v48, v39, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v9 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v33, v51, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v39, v19, v11 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v41, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v42, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v38, v31, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_or_b32_e32 v36, v13, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v35, v35, v54 +; SI-NEXT: v_or_b32_e32 v40, v41, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42 +; SI-NEXT: v_or_b32_e32 v13, v23, v13 +; SI-NEXT: v_or_b32_e32 v23, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 @@ -39908,160 +40006,178 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: 
v_or_b32_e32 v3, v49, v3 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_or_b32_e32 v7, v53, v7 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v15, v27, v15 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v3, v29, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v50, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: 
v_or_b32_e32 v0, v51, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v23, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v3 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v31 -; SI-NEXT: v_mov_b32_e32 v6, v19 -; SI-NEXT: v_mov_b32_e32 v8, v23 -; SI-NEXT: v_mov_b32_e32 v9, v27 -; SI-NEXT: v_mov_b32_e32 v10, v29 -; SI-NEXT: v_mov_b32_e32 v12, v32 -; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v2, v37 +; SI-NEXT: v_mov_b32_e32 v4, v38 +; SI-NEXT: v_mov_b32_e32 v6, v31 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v39 +; SI-NEXT: v_mov_b32_e32 v10, v32 +; SI-NEXT: v_mov_b32_e32 v12, v36 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16bf16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v34, v4 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 @@ -40074,9 +40190,9 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40088,34 +40204,27 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB110_3: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v16, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr12 @@ -40128,19 +40237,26 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: v_or_b32_sdwa v4, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: v_or_b32_sdwa v5, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: v_or_b32_sdwa v6, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: v_or_b32_sdwa v7, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -40178,21 +40294,21 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v34 -; VI-NEXT: v_or_b32_sdwa v10, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_or_b32_sdwa v10, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v2, v8, v2 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v12 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v1, 3, v31 ; VI-NEXT: v_or_b32_e32 v3, v8, v3 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v16 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v4, v8, v4 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v20 ; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 @@ -40211,16 +40327,16 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v32, v2 -; GFX9-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: v_mov_b32_e32 v34, v6 -; GFX9-NEXT: v_mov_b32_e32 v31, v4 -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v9 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v15 @@ -40232,9 +40348,9 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; 
GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40246,35 +40362,28 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB110_3: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v8, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr8 ; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr12 @@ -40287,19 +40396,26 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: v_perm_b32 v2, v4, v3, s6 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: v_perm_b32 v3, v6, v5, s6 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: v_perm_b32 v4, v11, v7, s6 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: v_perm_b32 v5, v19, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: v_perm_b32 v6, v23, v25, s6 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: v_perm_b32 v7, v27, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 
@@ -40337,21 +40453,21 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v10 ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: v_add_u16_e32 v0, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v38, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: 
v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -41006,6 +41122,9 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -41014,12 +41133,9 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: 
s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -41166,15 +41282,19 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff @@ -41192,16 +41312,12 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 
; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 6cf53d187fcab..a0db69d1a5dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -825,16 +825,20 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v18i16_to_v9i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v1 ; SI-NEXT: v_mov_b32_e32 v23, v8 ; SI-NEXT: v_mov_b32_e32 v22, v6 ; SI-NEXT: v_mov_b32_e32 v21, v4 ; SI-NEXT: v_mov_b32_e32 v20, v2 ; SI-NEXT: v_mov_b32_e32 v19, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 @@ -861,30 +865,30 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v25 -; SI-NEXT: v_or_b32_e32 v4, v4, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v18 -; SI-NEXT: v_or_b32_e32 v6, v6, v13 
-; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_or_b32_e32 v4, v4, v24 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -1286,24 +1290,22 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v9i32_to_v18f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v18, v8 ; SI-NEXT: v_mov_b32_e32 v24, v6 ; SI-NEXT: v_mov_b32_e32 v23, v5 ; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 ; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: v_mov_b32_e32 v18, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: 
$vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -1316,90 +1318,89 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 ; 
SI-NEXT: v_cvt_f32_f16_e32 v10, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 
v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; 
SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v26 +; SI-NEXT: v_mov_b32_e32 v3, v25 +; SI-NEXT: v_mov_b32_e32 v7, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18f16: @@ -1690,25 +1691,34 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v9i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_mov_b32_e32 v19, v8 +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v26, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 ; SI-NEXT: v_cvt_f16_f32_e32 
v11, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1720,91 +1730,91 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v2, v26, v2 -; SI-NEXT: v_or_b32_e32 v3, v24, v3 -; SI-NEXT: v_or_b32_e32 v4, v22, v4 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 ; SI-NEXT: ; 
implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -1944,8 +1954,8 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v26, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 ; 
SI-NEXT: v_cvt_f16_f32_e32 v19, s22 @@ -1965,7 +1975,7 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 @@ -1973,7 +1983,7 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v2, v21, v2 ; SI-NEXT: v_or_b32_e32 v3, v19, v3 ; SI-NEXT: v_or_b32_e32 v4, v17, v4 @@ -1984,49 +1994,49 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 
v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 @@ -2558,16 +2568,20 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v18i16_to_v9f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, 
v5 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v1 ; SI-NEXT: v_mov_b32_e32 v23, v8 ; SI-NEXT: v_mov_b32_e32 v22, v6 ; SI-NEXT: v_mov_b32_e32 v21, v4 ; SI-NEXT: v_mov_b32_e32 v20, v2 ; SI-NEXT: v_mov_b32_e32 v19, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 @@ -2594,30 +2608,30 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v25 -; SI-NEXT: v_or_b32_e32 v4, v4, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v18 -; SI-NEXT: v_or_b32_e32 v6, v6, v13 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_or_b32_e32 v4, v4, v24 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 ; 
SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -3019,24 +3033,22 @@ define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v9f32_to_v18f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v18, v8 ; SI-NEXT: v_mov_b32_e32 v24, v6 ; SI-NEXT: v_mov_b32_e32 v23, v5 ; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 ; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: v_mov_b32_e32 v18, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -3049,90 +3061,89 @@ define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 +; 
SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v19 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 
v11, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v26 +; SI-NEXT: v_mov_b32_e32 v3, v25 +; SI-NEXT: v_mov_b32_e32 v7, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18f16: @@ -3441,25 +3452,34 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v9f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, 
v6 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_mov_b32_e32 v19, v8 +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v26, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3471,91 +3491,91 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: 
v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v2, v26, v2 -; SI-NEXT: v_or_b32_e32 v3, v24, v3 -; SI-NEXT: v_or_b32_e32 v4, v22, v4 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 ; 
SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -3695,8 +3715,8 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v26, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 @@ -3716,7 +3736,7 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 @@ -3724,7 +3744,7 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v2, v21, v2 ; SI-NEXT: v_or_b32_e32 v3, v19, v3 ; SI-NEXT: 
v_or_b32_e32 v4, v17, v4 @@ -3735,49 +3755,49 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 
v4, v17 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 @@ -3968,24 +3988,21 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v18i16_to_v18f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v17 -; SI-NEXT: v_mov_b32_e32 v34, v16 -; SI-NEXT: v_mov_b32_e32 v33, v15 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v31, v13 -; SI-NEXT: v_mov_b32_e32 v30, v12 -; SI-NEXT: v_mov_b32_e32 v29, v11 -; SI-NEXT: v_mov_b32_e32 v28, v10 -; SI-NEXT: v_mov_b32_e32 v27, v9 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v25, v7 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v23, v5 -; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v0 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v30, v9 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; 
SI-NEXT: v_mov_b32_e32 v27, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -4002,40 +4019,28 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; 
%bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 @@ -4051,27 +4056,34 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v19 -; 
SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -4087,10 +4099,14 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v15, v18 +; SI-NEXT: v_mov_b32_e32 v16, v19 +; SI-NEXT: v_mov_b32_e32 v17, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v18f16: @@ -4777,52 +4793,52 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: 
s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s24, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_add_f16_e32 v3, s24, v0 ; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: s_lshr_b32 s4, s22, 16 ; VI-NEXT: v_or_b32_e32 v8, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 ; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: 
v_add_f16_e32 v4, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v6, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v5, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v4, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_add_f16_e32 v3, s19, v0 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_or_b32_e32 v3, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_add_f16_e32 v12, s18, v0 ; VI-NEXT: v_add_f16_e32 v9, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_or_b32_e32 v0, v9, v10 +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: v_or_b32_e32 v2, v12, v2 +; VI-NEXT: v_add_f16_sdwa v12, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v0 +; VI-NEXT: 
v_or_b32_e32 v0, v9, v12 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 436b1a038b274..4ce7deef62786 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -870,17 +870,22 @@ define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v10i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_mov_b32_e32 v27, v7 +; SI-NEXT: v_mov_b32_e32 v28, v5 +; SI-NEXT: v_mov_b32_e32 v29, v3 +; SI-NEXT: v_mov_b32_e32 v30, v1 ; SI-NEXT: v_mov_b32_e32 v25, v8 ; SI-NEXT: v_mov_b32_e32 v24, v6 ; SI-NEXT: v_mov_b32_e32 v23, v4 ; SI-NEXT: v_mov_b32_e32 v22, v2 ; SI-NEXT: v_mov_b32_e32 v21, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -907,35 +912,35 @@ define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v30 -; SI-NEXT: v_or_b32_e32 v2, v2, v29 -; SI-NEXT: v_or_b32_e32 v3, v3, v28 -; SI-NEXT: v_or_b32_e32 v4, v4, v27 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 -; SI-NEXT: v_or_b32_e32 v7, 
v7, v15 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v4, v4, v27 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 @@ -1363,27 +1368,24 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v10i32_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v29, v9 -; SI-NEXT: v_mov_b32_e32 v28, v8 -; SI-NEXT: v_mov_b32_e32 v27, v7 -; SI-NEXT: v_mov_b32_e32 v26, v6 -; SI-NEXT: v_mov_b32_e32 v25, v5 -; SI-NEXT: v_mov_b32_e32 v24, v4 -; SI-NEXT: v_mov_b32_e32 v23, v3 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v21, v1 -; SI-NEXT: v_mov_b32_e32 v20, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: 
v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 @@ -1396,98 +1398,98 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: 
v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: 
v_cvt_f32_f16_e32 v27, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 +; 
SI-NEXT: v_add_i32_e32 v11, vcc, 3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v1 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v3, v28 +; SI-NEXT: v_mov_b32_e32 v5, v27 +; SI-NEXT: v_mov_b32_e32 v9, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i32_to_v20f16: @@ -1797,27 +1799,37 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v10i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: 
v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v26, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v28, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v30, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1829,53 +1841,53 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: 
.LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v31, v1 -; SI-NEXT: v_or_b32_e32 v2, v29, v2 -; SI-NEXT: v_or_b32_e32 v3, v27, v3 -; SI-NEXT: v_or_b32_e32 v4, v25, v4 -; SI-NEXT: v_or_b32_e32 v5, v23, v5 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 ; SI-NEXT: ; implicit-def: $vgpr14 ; 
SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1884,25 +1896,25 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 
v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1910,11 +1922,11 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -1922,7 +1934,7 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 @@ -2600,46 +2612,46 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; 
implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr12 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -2663,7 +2675,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_4 @@ -2672,18 +2684,18 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_add_u32_e32 v7, vcc, 3, 
v7 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 ; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -2707,14 +2719,14 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2724,7 +2736,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2738,7 +2750,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2752,7 +2764,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2766,14 +2778,14 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2786,46 +2798,46 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; 
implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr12 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -2849,7 +2861,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: 
.LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_4 @@ -2858,18 +2870,18 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 ; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 ; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] ; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 ; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] ; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -2893,14 +2905,14 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2909,7 +2921,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2921,7 +2933,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD @@ -2933,7 +2945,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2945,13 +2957,13 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5629,79 +5641,79 @@ define inreg <10 x i32> 
@bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v37, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v26, v1 -; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 ; SI-NEXT: s_and_b32 s4, 
s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_or_b32_e32 v0, v2, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: v_or_b32_e32 v0, v2, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v0, v2, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: v_or_b32_e32 v0, v2, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 ; SI-NEXT: s_or_b32 s6, s6, s7 
+; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v3, v39, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_or_b32_e32 v3, s7, v3 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -5855,30 +5867,6 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -5899,16 +5887,40 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, 
v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_or_b32_sdwa v3, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 @@ -6036,30 +6048,6 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 ; 
GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 
s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -6080,16 +6068,40 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX9-NEXT: s_cbranch_execnz .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s16, s16, 3 @@ -7801,17 +7813,22 @@ define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v10f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_mov_b32_e32 v27, v7 +; SI-NEXT: v_mov_b32_e32 v28, v5 +; SI-NEXT: v_mov_b32_e32 v29, v3 +; SI-NEXT: v_mov_b32_e32 v30, v1 ; SI-NEXT: v_mov_b32_e32 v25, v8 ; SI-NEXT: v_mov_b32_e32 v24, v6 ; SI-NEXT: v_mov_b32_e32 v23, v4 ; SI-NEXT: v_mov_b32_e32 v22, v2 ; SI-NEXT: v_mov_b32_e32 v21, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 -; SI-NEXT: 
v_lshlrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -7838,35 +7855,35 @@ define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v30 -; SI-NEXT: v_or_b32_e32 v2, v2, v29 -; SI-NEXT: v_or_b32_e32 v3, v3, v28 -; SI-NEXT: v_or_b32_e32 v4, v4, v27 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 -; SI-NEXT: v_or_b32_e32 v7, v7, v15 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v4, v4, v27 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 
+; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 @@ -8294,27 +8311,24 @@ define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v10f32_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v29, v9 -; SI-NEXT: v_mov_b32_e32 v28, v8 -; SI-NEXT: v_mov_b32_e32 v27, v7 -; SI-NEXT: v_mov_b32_e32 v26, v6 -; SI-NEXT: v_mov_b32_e32 v25, v5 -; SI-NEXT: v_mov_b32_e32 v24, v4 -; SI-NEXT: v_mov_b32_e32 v23, v3 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v21, v1 -; SI-NEXT: v_mov_b32_e32 v20, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 @@ -8327,98 +8341,98 @@ define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB28_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB28_4 -; SI-NEXT: .LBB28_2: 
; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB28_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_2 -; SI-NEXT: .LBB28_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v1 +; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v3, v28 +; SI-NEXT: v_mov_b32_e32 v5, v27 +; SI-NEXT: v_mov_b32_e32 v9, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f32_to_v20f16: @@ -8743,27 +8757,37 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v10f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v26, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v28, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v30, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, 
v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8775,53 +8799,53 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v31, v1 -; SI-NEXT: v_or_b32_e32 v2, v29, v2 -; SI-NEXT: v_or_b32_e32 v3, v27, v3 -; SI-NEXT: v_or_b32_e32 v4, v25, v4 -; SI-NEXT: v_or_b32_e32 v5, v23, v5 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: ; implicit-def: $vgpr34 ; 
SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8830,25 +8854,25 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -8856,11 +8880,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -8868,7 +8892,7 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 @@ -9546,46 +9570,46 @@ define <40 x i8> 
@bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr12 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, 
v[5:6] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -9609,7 +9633,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB32_4 @@ -9618,18 +9642,18 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] ; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] ; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -9653,14 +9677,14 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 
-; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -9670,7 +9694,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9684,7 +9708,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; 
VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9698,7 +9722,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9712,14 +9736,14 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9732,46 +9756,46 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr12 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: 
v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -9795,7 +9819,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB32_4 @@ -9804,18 +9828,18 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 ; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 ; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] ; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 ; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] ; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 ; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 
; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -9839,14 +9863,14 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -9855,7 +9879,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9867,7 +9891,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9879,7 +9903,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9891,13 +9915,13 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 
8, v12 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10624,17 +10648,17 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 ; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v5 ; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v8 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 @@ -10643,8 +10667,8 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v10 ; 
VI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v9 ; VI-NEXT: s_branch .LBB33_5 ; VI-NEXT: .LBB33_3: ; VI-NEXT: ; implicit-def: $sgpr76 @@ -10680,17 +10704,12 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: s_branch .LBB33_2 ; VI-NEXT: .LBB33_4: ; VI-NEXT: v_mov_b32_e32 v9, s16 -; VI-NEXT: v_mov_b32_e32 v10, s17 ; VI-NEXT: v_mov_b32_e32 v7, s18 -; VI-NEXT: v_mov_b32_e32 v8, s19 ; VI-NEXT: v_mov_b32_e32 v5, s20 -; VI-NEXT: v_mov_b32_e32 v6, s21 ; VI-NEXT: v_mov_b32_e32 v3, s22 -; VI-NEXT: v_mov_b32_e32 v4, s23 ; VI-NEXT: v_mov_b32_e32 v1, s24 -; VI-NEXT: v_mov_b32_e32 v2, s25 -; VI-NEXT: v_mov_b32_e32 v39, s76 -; VI-NEXT: v_mov_b32_e32 v48, s74 +; VI-NEXT: v_mov_b32_e32 v48, s76 +; VI-NEXT: v_mov_b32_e32 v39, s74 ; VI-NEXT: v_mov_b32_e32 v38, s75 ; VI-NEXT: v_mov_b32_e32 v36, s73 ; VI-NEXT: v_mov_b32_e32 v37, s72 @@ -10699,17 +10718,17 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v33, s62 ; VI-NEXT: v_mov_b32_e32 v31, s60 ; VI-NEXT: v_mov_b32_e32 v32, s59 -; VI-NEXT: v_mov_b32_e32 v30, s58 -; VI-NEXT: v_mov_b32_e32 v29, s56 +; VI-NEXT: v_mov_b32_e32 v29, s58 +; VI-NEXT: v_mov_b32_e32 v30, s56 ; VI-NEXT: v_mov_b32_e32 v28, s57 ; VI-NEXT: v_mov_b32_e32 v26, s47 ; VI-NEXT: v_mov_b32_e32 v27, s46 ; VI-NEXT: v_mov_b32_e32 v25, s45 ; VI-NEXT: v_mov_b32_e32 v24, s43 ; VI-NEXT: v_mov_b32_e32 v23, s44 -; VI-NEXT: v_mov_b32_e32 v21, s42 -; VI-NEXT: v_mov_b32_e32 v22, s41 -; VI-NEXT: v_mov_b32_e32 v20, s40 +; VI-NEXT: v_mov_b32_e32 v20, s42 +; VI-NEXT: v_mov_b32_e32 v21, s41 +; VI-NEXT: v_mov_b32_e32 v22, s40 ; VI-NEXT: v_mov_b32_e32 v19, s28 ; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_mov_b32_e32 v16, s27 @@ -10719,11 +10738,16 @@ define inreg <40 x i8> 
@bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s8 ; VI-NEXT: v_mov_b32_e32 v12, s10 ; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v4, s23 +; VI-NEXT: v_mov_b32_e32 v2, s25 ; VI-NEXT: .LBB33_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; VI-NEXT: v_or_b32_sdwa v9, v9, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v38 @@ -10747,10 +10771,10 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v8, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v29 ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 -; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 16, v0 ; 
VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen @@ -10770,12 +10794,12 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; VI-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; VI-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v4, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 ; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -10849,17 +10873,17 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 @@ -10868,8 +10892,8 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v9 ; GFX9-NEXT: s_branch .LBB33_5 ; GFX9-NEXT: .LBB33_3: ; GFX9-NEXT: ; implicit-def: $sgpr76 @@ -10905,17 +10929,12 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: s_branch .LBB33_2 ; GFX9-NEXT: .LBB33_4: ; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: v_mov_b32_e32 v10, s17 ; GFX9-NEXT: v_mov_b32_e32 v7, s18 -; GFX9-NEXT: v_mov_b32_e32 v8, s19 ; GFX9-NEXT: v_mov_b32_e32 v5, s20 -; GFX9-NEXT: v_mov_b32_e32 v6, s21 ; GFX9-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-NEXT: v_mov_b32_e32 v4, s23 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-NEXT: v_mov_b32_e32 v2, s25 -; GFX9-NEXT: v_mov_b32_e32 v39, s76 -; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v48, s76 +; GFX9-NEXT: v_mov_b32_e32 v39, s74 ; GFX9-NEXT: v_mov_b32_e32 v38, s75 ; GFX9-NEXT: v_mov_b32_e32 v36, s73 ; GFX9-NEXT: v_mov_b32_e32 v37, s72 @@ -10924,17 +10943,17 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v33, s62 ; GFX9-NEXT: v_mov_b32_e32 v31, s60 ; GFX9-NEXT: v_mov_b32_e32 v32, s59 -; GFX9-NEXT: v_mov_b32_e32 v30, s58 -; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v29, s58 +; GFX9-NEXT: v_mov_b32_e32 v30, s56 ; GFX9-NEXT: v_mov_b32_e32 v28, s57 ; GFX9-NEXT: v_mov_b32_e32 v26, s47 ; GFX9-NEXT: v_mov_b32_e32 v27, s46 ; 
GFX9-NEXT: v_mov_b32_e32 v25, s45 ; GFX9-NEXT: v_mov_b32_e32 v24, s43 ; GFX9-NEXT: v_mov_b32_e32 v23, s44 -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v22, s41 -; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v20, s42 +; GFX9-NEXT: v_mov_b32_e32 v21, s41 +; GFX9-NEXT: v_mov_b32_e32 v22, s40 ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v16, s27 @@ -10944,11 +10963,16 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v13, s8 ; GFX9-NEXT: v_mov_b32_e32 v12, s10 ; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 ; GFX9-NEXT: .LBB33_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 @@ -10969,10 +10993,10 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 
offen offset:12 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 -; GFX9-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 @@ -10989,11 +11013,11 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12589,79 +12613,79 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v37, v1 ; 
SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v26, v1 -; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v12 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_or_b32_e32 v0, v2, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: v_or_b32_e32 v0, v2, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v0, v2, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: v_or_b32_e32 v0, v2, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: s_or_b32 s7, 
s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v3, v39, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_or_b32_e32 v3, s7, v3 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -12815,30 +12839,6 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB35_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -12859,16 +12859,40 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_or_b32_sdwa v3, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 ; VI-NEXT: s_cbranch_execnz .LBB35_3 ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 @@ -12996,30 +13020,6 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -13040,16 +13040,40 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: 
v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX9-NEXT: s_cbranch_execnz .LBB35_3 ; GFX9-NEXT: .LBB35_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s16, s16, 3 @@ -14449,26 +14473,23 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v19 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v37, v17 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v35, v15 -; SI-NEXT: v_mov_b32_e32 v34, v14 -; SI-NEXT: v_mov_b32_e32 v33, v13 -; SI-NEXT: v_mov_b32_e32 v32, v12 -; SI-NEXT: v_mov_b32_e32 v31, v11 -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v29, v9 -; SI-NEXT: v_mov_b32_e32 v28, v8 -; SI-NEXT: v_mov_b32_e32 v27, v7 -; SI-NEXT: v_mov_b32_e32 v26, v6 -; SI-NEXT: v_mov_b32_e32 v25, v5 -; SI-NEXT: v_mov_b32_e32 v24, v4 -; SI-NEXT: v_mov_b32_e32 v23, v3 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; 
SI-NEXT: v_mov_b32_e32 v21, v1 -; SI-NEXT: v_mov_b32_e32 v48, v0 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v7 +; SI-NEXT: v_mov_b32_e32 v29, v6 +; SI-NEXT: v_mov_b32_e32 v28, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -14487,42 +14508,30 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: 
v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -14540,29 +14549,36 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v37 -; SI-NEXT: 
v_add_i32_e32 v16, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -14580,10 +14596,14 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: 
v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v17, v20 +; SI-NEXT: v_mov_b32_e32 v18, v21 +; SI-NEXT: v_mov_b32_e32 v19, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i16_to_v20f16: @@ -14605,8 +14625,8 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v17, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v18, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v19, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v10, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v9, 3, v9 +; VI-NEXT: v_add_u16_e32 v20, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_add_u16_e32 v6, 3, v6 @@ -14616,7 +14636,7 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_or_b32_e32 v9, v20, v9 ; VI-NEXT: v_or_b32_e32 v8, v8, v19 ; VI-NEXT: v_or_b32_e32 v7, v7, v18 ; VI-NEXT: v_or_b32_e32 v6, v6, v17 @@ -14994,50 +15014,48 @@ define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: 
v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: 
v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -15047,35 +15065,37 @@ define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 ; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v17 ; SI-NEXT: v_or_b32_e32 v2, v2, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v21, 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_alignbit_b32 
v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 ; SI-NEXT: .LBB46_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -15091,28 +15111,28 @@ define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) { ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 0x200 ; VI-NEXT: v_add_f16_e32 v10, 0x200, v0 -; VI-NEXT: v_add_f16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v12, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v13, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v14, 0x200, v3 -; VI-NEXT: v_add_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v16, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v6 -; VI-NEXT: v_add_f16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v7 -; VI-NEXT: v_add_f16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 -; VI-NEXT: v_add_f16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v11, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v11 -; 
VI-NEXT: v_or_b32_e32 v8, v19, v8 -; VI-NEXT: v_or_b32_e32 v7, v18, v7 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v20, v9 +; VI-NEXT: v_or_b32_e32 v8, v19, v8 +; VI-NEXT: v_or_b32_e32 v7, v18, v7 ; VI-NEXT: v_or_b32_e32 v6, v17, v6 ; VI-NEXT: v_or_b32_e32 v5, v16, v5 ; VI-NEXT: v_or_b32_e32 v4, v15, v4 @@ -15198,7 +15218,7 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 @@ -15219,6 +15239,7 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 
v19, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -15227,50 +15248,48 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; 
SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -15280,35 +15299,37 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 ; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: 
v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v17 ; SI-NEXT: v_or_b32_e32 v2, v2, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v21, 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -15322,57 +15343,57 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s25, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s25, 16 -; VI-NEXT: v_add_f16_e32 v1, s24, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_add_f16_e32 v3, s25, v0 ; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: s_lshr_b32 s4, s23, 16 ; VI-NEXT: v_or_b32_e32 v9, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 ; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v8, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 
16 +; VI-NEXT: v_or_b32_e32 v6, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v5, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v4, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_e32 v3, s19, v0 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v11, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_or_b32_e32 v3, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_e32 v13, s18, v0 ; VI-NEXT: v_add_f16_e32 v10, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_or_b32_e32 v0, v10, v11 +; VI-NEXT: v_add_f16_e32 v11, s17, v0 +; VI-NEXT: v_or_b32_e32 v2, v13, v2 +; VI-NEXT: v_add_f16_sdwa v13, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v11, v0 +; VI-NEXT: v_or_b32_e32 v0, v10, v13 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -16100,46 +16121,46 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v11 -; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr12 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; 
GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -16163,7 +16184,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_4 @@ -16172,18 +16193,18 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] ; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -16207,14 +16228,14 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, 
i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -16223,7 +16244,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -16235,7 
+16256,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -16247,7 +16268,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -16259,13 +16280,13 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -17338,17 +17359,17 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 @@ -17357,8 +17378,8 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 
v39, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v9 ; GFX9-NEXT: s_branch .LBB49_5 ; GFX9-NEXT: .LBB49_3: ; GFX9-NEXT: ; implicit-def: $sgpr76 @@ -17394,17 +17415,12 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: s_branch .LBB49_2 ; GFX9-NEXT: .LBB49_4: ; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: v_mov_b32_e32 v10, s17 ; GFX9-NEXT: v_mov_b32_e32 v7, s18 -; GFX9-NEXT: v_mov_b32_e32 v8, s19 ; GFX9-NEXT: v_mov_b32_e32 v5, s20 -; GFX9-NEXT: v_mov_b32_e32 v6, s21 ; GFX9-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-NEXT: v_mov_b32_e32 v4, s23 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-NEXT: v_mov_b32_e32 v2, s25 -; GFX9-NEXT: v_mov_b32_e32 v39, s76 -; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v48, s76 +; GFX9-NEXT: v_mov_b32_e32 v39, s74 ; GFX9-NEXT: v_mov_b32_e32 v38, s75 ; GFX9-NEXT: v_mov_b32_e32 v36, s73 ; GFX9-NEXT: v_mov_b32_e32 v37, s72 @@ -17413,17 +17429,17 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v33, s62 ; GFX9-NEXT: v_mov_b32_e32 v31, s60 ; GFX9-NEXT: v_mov_b32_e32 v32, s59 -; GFX9-NEXT: v_mov_b32_e32 v30, s58 -; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v29, s58 +; GFX9-NEXT: v_mov_b32_e32 v30, s56 ; GFX9-NEXT: v_mov_b32_e32 v28, s57 ; GFX9-NEXT: v_mov_b32_e32 v26, s47 ; GFX9-NEXT: v_mov_b32_e32 v27, s46 ; GFX9-NEXT: v_mov_b32_e32 v25, s45 ; GFX9-NEXT: v_mov_b32_e32 v24, s43 ; GFX9-NEXT: v_mov_b32_e32 v23, s44 -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v22, s41 -; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v20, s42 +; GFX9-NEXT: v_mov_b32_e32 v21, s41 +; GFX9-NEXT: v_mov_b32_e32 v22, s40 ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v16, s27 @@ -17433,11 +17449,16 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: 
v_mov_b32_e32 v13, s8 ; GFX9-NEXT: v_mov_b32_e32 v14, s6 ; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 ; GFX9-NEXT: .LBB49_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 @@ -17458,10 +17479,10 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 -; GFX9-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 @@ -17478,11 +17499,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -19392,33 +19413,33 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v38 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: v_or_b32_sdwa v3, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v28, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v8, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
-; VI-NEXT: v_or_b32_sdwa v0, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -19552,58 +19573,58 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 ; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX9-NEXT: ; %bb.1: ; 
%cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v5, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 
-; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s29, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v1, s7, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v7, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; 
GFX9-NEXT: v_or_b32_sdwa v2, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_or_b32_sdwa v9, v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -19978,32 +19999,32 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 ; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v35 -; SI-NEXT: v_or_b32_e32 v2, v2, v34 -; SI-NEXT: v_or_b32_e32 v3, v3, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v31 -; SI-NEXT: v_or_b32_e32 v6, v6, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v29 -; SI-NEXT: v_or_b32_e32 v8, v8, v20 -; SI-NEXT: v_or_b32_e32 v9, v9, v17 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v3, v3, v33 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 ; 
SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v31 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -20031,8 +20052,8 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_or_b32_e32 v3, v33, v3 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 @@ -20189,18 +20210,18 @@ define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_or_b32_e32 v8, v0, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: v_or_b32_e32 v9, v0, v19 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -20442,17 +20463,15 @@ 
define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v5f64_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v9 -; SI-NEXT: v_mov_b32_e32 v27, v8 -; SI-NEXT: v_mov_b32_e32 v26, v7 -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: v_mov_b32_e32 v24, v5 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v22, v3 -; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v5 +; SI-NEXT: v_mov_b32_e32 v22, v4 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -20465,47 +20484,48 @@ define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v13, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v5, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v26, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 
16, v1 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v13, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v5, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v26, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v21 -; SI-NEXT: v_mov_b32_e32 v6, v22 -; SI-NEXT: v_mov_b32_e32 v8, v23 -; SI-NEXT: v_mov_b32_e32 v10, v24 -; SI-NEXT: v_mov_b32_e32 v12, v25 -; SI-NEXT: v_mov_b32_e32 v14, v26 -; SI-NEXT: v_mov_b32_e32 v16, v27 -; SI-NEXT: v_mov_b32_e32 v18, v28 -; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v8, v22 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: v_mov_b32_e32 v10, v23 
+; SI-NEXT: v_mov_b32_e32 v16, v20 +; SI-NEXT: v_mov_b32_e32 v14, v25 +; SI-NEXT: v_mov_b32_e32 v1, v26 +; SI-NEXT: v_mov_b32_e32 v3, v18 +; SI-NEXT: v_mov_b32_e32 v18, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20i16: @@ -20587,9 +20607,9 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB55_3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v20, s25, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v21, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v18, s23, v0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s20 ; SI-NEXT: v_alignbit_b32 v22, s21, v0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s18 @@ -20606,14 +20626,14 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 ; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[24:25], 1.0 ; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 -; SI-NEXT: v_alignbit_b32 v20, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v21, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v18, v13, v12, 16 ; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 ; SI-NEXT: v_alignbit_b32 v23, v5, v4, 16 ; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 @@ -20626,9 +20646,9 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: 
$vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB55_2 ; SI-NEXT: .LBB55_4: @@ -20636,8 +20656,8 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v5, s19 ; SI-NEXT: v_mov_b32_e32 v9, s21 ; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_mov_b32_e32 v12, s22 ; SI-NEXT: v_mov_b32_e32 v8, s20 ; SI-NEXT: v_mov_b32_e32 v4, s18 @@ -20651,13 +20671,13 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_mov_b32_e32 v6, v5 ; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v16, v20 ; SI-NEXT: v_mov_b32_e32 v14, v13 -; SI-NEXT: v_mov_b32_e32 v18, v17 ; SI-NEXT: v_mov_b32_e32 v1, v24 ; SI-NEXT: v_mov_b32_e32 v5, v23 ; SI-NEXT: v_mov_b32_e32 v9, v22 -; SI-NEXT: v_mov_b32_e32 v13, v21 -; SI-NEXT: v_mov_b32_e32 v17, v20 +; SI-NEXT: v_mov_b32_e32 v13, v18 +; SI-NEXT: v_mov_b32_e32 v18, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20i16_scalar: @@ -20824,32 +20844,32 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 ; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v35 -; SI-NEXT: v_or_b32_e32 v2, v2, v34 -; SI-NEXT: v_or_b32_e32 v3, v3, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v31 -; SI-NEXT: v_or_b32_e32 v6, v6, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v29 -; SI-NEXT: v_or_b32_e32 v8, v8, v20 -; SI-NEXT: v_or_b32_e32 v9, v9, v17 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; 
SI-NEXT: v_or_b32_e32 v3, v3, v33 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v31 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -20877,8 +20897,8 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_or_b32_e32 v3, v33, v3 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 @@ -21035,18 +21055,18 @@ define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_or_b32_e32 v8, v0, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, 
v1, v20 ; SI-NEXT: v_or_b32_e32 v9, v0, v19 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -22010,49 +22030,48 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v20f16_to_v40i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: 
$vgpr29 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB60_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v8 ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 @@ -22064,140 +22083,141 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 ; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v1 ; VI-NEXT: .LBB60_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB60_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 0x200 -; VI-NEXT: v_add_f16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; VI-NEXT: v_add_f16_sdwa v24, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v26, v1, v11 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v39, v2, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v26 +; VI-NEXT: v_add_f16_sdwa v27, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v51, v2, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v27 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v21, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v38, v1, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v21 +; VI-NEXT: v_add_f16_sdwa v22, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v50, v1, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 ; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v25, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v36, v4, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; VI-NEXT: v_add_f16_sdwa v26, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v4, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v26 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_add_f16_sdwa v19, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v35, v3, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; VI-NEXT: v_add_f16_sdwa v20, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v3, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_add_f16_sdwa v24, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v33, v6, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; VI-NEXT: v_add_f16_sdwa v25, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v53, v6, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 ; VI-NEXT: 
v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v18, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v32, v5, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; VI-NEXT: v_add_f16_sdwa v19, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v52, v5, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_add_f16_sdwa v22, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v30, v8, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; VI-NEXT: v_add_f16_sdwa v23, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_add_f16_sdwa v17, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v20, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v7, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; VI-NEXT: v_add_f16_sdwa v18, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v21, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v7, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 ; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v49, v10, v12 -; VI-NEXT: v_or_b32_e32 v48, v9, v11 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[48:49] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[29:30] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[35:36] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[38:39] -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v49 -; VI-NEXT: 
v_lshrrev_b32_e32 v28, 8, v48 -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v38 -; VI-NEXT: v_bfe_u32 v29, v17, 8, 8 -; VI-NEXT: v_bfe_u32 v32, v18, 8, 8 -; VI-NEXT: v_bfe_u32 v35, v19, 8, 8 -; VI-NEXT: v_bfe_u32 v38, v21, 8, 8 -; VI-NEXT: v_bfe_u32 v48, v23, 8, 8 +; VI-NEXT: v_or_b32_e32 v13, v10, v12 +; VI-NEXT: v_or_b32_e32 v12, v9, v11 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[52:53] +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[16:17] +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v17 +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[50:51] +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v51 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v50 +; VI-NEXT: v_bfe_u32 v11, v18, 8, 8 +; VI-NEXT: v_bfe_u32 v32, v19, 8, 8 +; VI-NEXT: v_bfe_u32 v35, v20, 8, 8 +; VI-NEXT: v_bfe_u32 v38, v22, 8, 8 +; VI-NEXT: v_bfe_u32 v48, v24, 8, 8 ; VI-NEXT: .LBB60_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v49 +; VI-NEXT: 
v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v48 -; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; 
VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 
@@ -22208,46 +22228,46 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr12 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB60_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, 
v10 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -22271,7 +22291,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB60_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB60_4 @@ -22281,18 +22301,18 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] ; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] ; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, 
v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -22316,14 +22336,14 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB60_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -22332,7 +22352,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -22344,7 +22364,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -22356,7 +22376,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -22368,13 +22388,13 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v2, 8, v12 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -22803,7 +22823,7 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v15, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 @@ -22811,8 +22831,9 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v53, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v50, s26 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v43, s29 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v44, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -22820,16 +22841,15 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v5 -; SI-NEXT: 
v_cvt_f16_f32_e32 v44, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_or_b32_e32 v28, v15, v3 +; SI-NEXT: v_or_b32_e32 v27, v15, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_or_b32_e32 v24, v12, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_or_b32_e32 v14, v33, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: v_or_b32_e32 v13, v20, v3 @@ -22837,34 +22857,34 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v7, v53, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v11, v50, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 -; SI-NEXT: v_or_b32_e32 v5, v44, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v5, v43, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v6, v41, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 ; SI-NEXT: v_or_b32_e32 v4, v46, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_or_b32_e32 v3, v45, v3 -; SI-NEXT: v_alignbit_b32 v30, v24, v28, 24 -; SI-NEXT: v_alignbit_b32 v35, v24, v28, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v28, 8 -; SI-NEXT: v_alignbit_b32 v29, v13, v14, 24 +; SI-NEXT: v_alignbit_b32 v30, v24, v27, 24 +; SI-NEXT: v_alignbit_b32 v35, v24, v27, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v27, 8 +; SI-NEXT: v_alignbit_b32 v28, v13, v14, 24 ; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 ; SI-NEXT: v_alignbit_b32 v36, v13, v14, 8 -; SI-NEXT: v_alignbit_b32 v23, v11, v7, 24 -; SI-NEXT: v_alignbit_b32 v26, v11, v7, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v7, 24 +; SI-NEXT: v_alignbit_b32 v25, v11, v7, 16 ; SI-NEXT: v_alignbit_b32 v32, v11, v7, 8 ; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 -; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v27, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v22, v6, 
v5, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 8 ; SI-NEXT: v_alignbit_b32 v17, v3, v4, 24 ; SI-NEXT: v_alignbit_b32 v18, v3, v4, 16 -; SI-NEXT: v_alignbit_b32 v22, v3, v4, 8 +; SI-NEXT: v_alignbit_b32 v23, v3, v4, 8 ; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v24 ; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v13 ; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v3 ; SI-NEXT: v_bfe_u32 v42, v10, 8, 8 ; SI-NEXT: v_bfe_u32 v55, v9, 8, 8 ; SI-NEXT: v_bfe_u32 v51, v8, 8, 8 @@ -22886,10 +22906,10 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v4, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 @@ -22919,7 +22939,7 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 ; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 @@ -22948,37 +22968,37 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v14, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_or_b32_e32 v28, v15, v16 +; SI-NEXT: v_or_b32_e32 v27, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v10 ; SI-NEXT: v_or_b32_e32 v13, v17, v13 ; SI-NEXT: v_or_b32_e32 v24, v12, v15 
-; SI-NEXT: v_alignbit_b32 v30, v24, v28, 24 -; SI-NEXT: v_alignbit_b32 v35, v24, v28, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v28, 8 -; SI-NEXT: v_alignbit_b32 v29, v13, v14, 24 +; SI-NEXT: v_alignbit_b32 v30, v24, v27, 24 +; SI-NEXT: v_alignbit_b32 v35, v24, v27, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v27, 8 +; SI-NEXT: v_alignbit_b32 v28, v13, v14, 24 ; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 ; SI-NEXT: v_alignbit_b32 v36, v13, v14, 8 -; SI-NEXT: v_alignbit_b32 v23, v11, v7, 24 -; SI-NEXT: v_alignbit_b32 v26, v11, v7, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v7, 24 +; SI-NEXT: v_alignbit_b32 v25, v11, v7, 16 ; SI-NEXT: v_alignbit_b32 v32, v11, v7, 8 ; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 -; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v27, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 8 ; SI-NEXT: v_alignbit_b32 v17, v3, v4, 24 ; SI-NEXT: v_alignbit_b32 v18, v3, v4, 16 -; SI-NEXT: v_alignbit_b32 v22, v3, v4, 8 +; SI-NEXT: v_alignbit_b32 v23, v3, v4, 8 ; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v24 ; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v13 ; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v3 ; SI-NEXT: v_bfe_u32 v42, v10, 8, 8 ; SI-NEXT: v_bfe_u32 v55, v9, 8, 8 ; SI-NEXT: v_bfe_u32 v51, v8, 8, 8 ; SI-NEXT: v_bfe_u32 v48, v2, 8, 8 ; SI-NEXT: v_bfe_u32 v34, v1, 8, 8 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v37 ; SI-NEXT: v_or_b32_e32 v12, v12, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v35 @@ -23006,7 +23026,7 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, 
v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 @@ -23028,9 +23048,9 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v32 ; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v25 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v21 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v9 @@ -23050,9 +23070,9 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v29 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v19 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 @@ -23062,7 +23082,7 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v39 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -23074,7 +23094,7 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 ; SI-NEXT: v_or_b32_e32 v2, 
v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -23086,7 +23106,7 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v26 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -23107,7 +23127,7 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -23117,30 +23137,30 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; 
implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_branch .LBB61_2 ; @@ -23186,73 +23206,73 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v1, 0x200 ; VI-NEXT: v_add_f16_e32 v8, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; VI-NEXT: v_add_f16_e32 v17, s17, v1 ; VI-NEXT: v_add_f16_e32 v12, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_or_b32_e32 v39, v17, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; VI-NEXT: v_add_f16_e32 v22, s16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_add_f16_e32 v17, s17, v1 ; VI-NEXT: v_add_f16_e32 v9, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v38, v22, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; VI-NEXT: v_add_f16_e32 v18, s19, v1 +; VI-NEXT: v_or_b32_e32 v51, v17, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; VI-NEXT: v_add_f16_e32 v22, s16, v1 ; VI-NEXT: v_add_f16_e32 v13, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v36, v18, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; VI-NEXT: v_add_f16_e32 v23, s18, v1 +; VI-NEXT: v_or_b32_e32 v50, v22, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; VI-NEXT: v_add_f16_e32 v18, s19, v1 ; VI-NEXT: v_add_f16_e32 v10, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v35, v23, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; VI-NEXT: v_add_f16_e32 v19, s21, v1 +; VI-NEXT: v_or_b32_e32 v6, v18, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_add_f16_e32 v23, s18, v1 ; VI-NEXT: v_add_f16_e32 v14, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v33, v19, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; VI-NEXT: v_add_f16_e32 v24, s20, v1 +; VI-NEXT: v_or_b32_e32 
v5, v23, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; VI-NEXT: v_add_f16_e32 v19, s21, v1 ; VI-NEXT: v_add_f16_e32 v11, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v32, v24, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; VI-NEXT: v_add_f16_e32 v20, s23, v1 +; VI-NEXT: v_or_b32_e32 v53, v19, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; VI-NEXT: v_add_f16_e32 v24, s20, v1 ; VI-NEXT: v_add_f16_e32 v15, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s25, 16 -; VI-NEXT: v_or_b32_e32 v30, v20, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; VI-NEXT: v_add_f16_e32 v25, s22, v1 +; VI-NEXT: v_or_b32_e32 v52, v24, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; VI-NEXT: v_add_f16_e32 v20, s23, v1 ; VI-NEXT: v_add_f16_e32 v7, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v29, v25, v2 +; VI-NEXT: v_or_b32_e32 v4, v20, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_add_f16_e32 v25, s22, v1 +; VI-NEXT: v_add_f16_e32 v16, s4, v1 +; VI-NEXT: v_or_b32_e32 v3, v25, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; VI-NEXT: v_add_f16_e32 v21, s25, v1 -; VI-NEXT: v_add_f16_e32 v16, s4, v1 -; VI-NEXT: v_or_b32_e32 v49, v21, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v16 ; VI-NEXT: v_add_f16_e32 v26, s24, v1 -; VI-NEXT: v_or_b32_e32 v48, v26, v2 -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[48:49] -; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[29:30] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[35:36] -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[38:39] -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v49 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v48 -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v38 -; VI-NEXT: 
v_bfe_u32 v6, v7, 8, 8 -; VI-NEXT: v_bfe_u32 v29, v11, 8, 8 -; VI-NEXT: v_bfe_u32 v32, v10, 8, 8 -; VI-NEXT: v_bfe_u32 v35, v9, 8, 8 -; VI-NEXT: v_bfe_u32 v38, v8, 8, 8 +; VI-NEXT: v_or_b32_e32 v2, v21, v2 +; VI-NEXT: v_or_b32_e32 v1, v26, v27 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[52:53] +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[50:51] +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v51 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v50 +; VI-NEXT: v_bfe_u32 v27, v7, 8, 8 +; VI-NEXT: v_bfe_u32 v30, v11, 8, 8 +; VI-NEXT: v_bfe_u32 v33, v10, 8, 8 +; VI-NEXT: v_bfe_u32 v36, v9, 8, 8 +; VI-NEXT: v_bfe_u32 v39, v8, 8, 8 ; VI-NEXT: s_branch .LBB61_5 ; VI-NEXT: .LBB61_3: ; VI-NEXT: ; implicit-def: $sgpr56 @@ -23307,91 +23327,91 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v20, s23 ; VI-NEXT: v_mov_b32_e32 v26, s24 ; VI-NEXT: v_mov_b32_e32 v21, s25 -; VI-NEXT: v_mov_b32_e32 v38, s58 -; VI-NEXT: v_mov_b32_e32 v35, s57 -; VI-NEXT: v_mov_b32_e32 v32, s46 -; VI-NEXT: v_mov_b32_e32 v29, s43 -; VI-NEXT: v_mov_b32_e32 v6, s41 -; VI-NEXT: v_mov_b32_e32 v48, s56 -; VI-NEXT: v_mov_b32_e32 v39, s47 -; VI-NEXT: v_mov_b32_e32 v37, s45 -; VI-NEXT: v_mov_b32_e32 v36, s44 -; VI-NEXT: v_mov_b32_e32 v34, s42 -; VI-NEXT: v_mov_b32_e32 v33, s40 -; VI-NEXT: v_mov_b32_e32 v31, s29 -; VI-NEXT: v_mov_b32_e32 v30, s28 -; VI-NEXT: v_mov_b32_e32 v28, s27 -; VI-NEXT: v_mov_b32_e32 v27, s26 +; VI-NEXT: v_mov_b32_e32 v39, s58 +; VI-NEXT: v_mov_b32_e32 v36, s57 +; VI-NEXT: v_mov_b32_e32 v33, s46 +; 
VI-NEXT: v_mov_b32_e32 v30, s43 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v49, s56 +; VI-NEXT: v_mov_b32_e32 v48, s47 +; VI-NEXT: v_mov_b32_e32 v38, s45 +; VI-NEXT: v_mov_b32_e32 v37, s44 +; VI-NEXT: v_mov_b32_e32 v35, s42 +; VI-NEXT: v_mov_b32_e32 v34, s40 +; VI-NEXT: v_mov_b32_e32 v32, s29 +; VI-NEXT: v_mov_b32_e32 v31, s28 +; VI-NEXT: v_mov_b32_e32 v29, s27 +; VI-NEXT: v_mov_b32_e32 v28, s26 ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v3, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v1, s12 ; VI-NEXT: .LBB61_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v49 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; VI-NEXT: v_or_b32_sdwa v22, v22, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v12, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v39 -; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v39 ; VI-NEXT: v_or_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v38 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-NEXT: v_or_b32_sdwa v5, v23, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v5, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v36 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v36 ; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v35 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_or_b32_sdwa v4, v24, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v33 ; VI-NEXT: 
v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v4, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v30 ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -23458,17 +23478,17 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 @@ -23477,8 +23497,8 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v9 ; GFX9-NEXT: s_branch .LBB61_5 ; 
GFX9-NEXT: .LBB61_3: ; GFX9-NEXT: ; implicit-def: $sgpr76 @@ -23514,17 +23534,12 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: s_branch .LBB61_2 ; GFX9-NEXT: .LBB61_4: ; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: v_mov_b32_e32 v10, s17 ; GFX9-NEXT: v_mov_b32_e32 v7, s18 -; GFX9-NEXT: v_mov_b32_e32 v8, s19 ; GFX9-NEXT: v_mov_b32_e32 v5, s20 -; GFX9-NEXT: v_mov_b32_e32 v6, s21 ; GFX9-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-NEXT: v_mov_b32_e32 v4, s23 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-NEXT: v_mov_b32_e32 v2, s25 -; GFX9-NEXT: v_mov_b32_e32 v39, s76 -; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v48, s76 +; GFX9-NEXT: v_mov_b32_e32 v39, s74 ; GFX9-NEXT: v_mov_b32_e32 v38, s75 ; GFX9-NEXT: v_mov_b32_e32 v36, s73 ; GFX9-NEXT: v_mov_b32_e32 v37, s72 @@ -23533,17 +23548,17 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v33, s62 ; GFX9-NEXT: v_mov_b32_e32 v31, s60 ; GFX9-NEXT: v_mov_b32_e32 v32, s59 -; GFX9-NEXT: v_mov_b32_e32 v30, s58 -; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v29, s58 +; GFX9-NEXT: v_mov_b32_e32 v30, s56 ; GFX9-NEXT: v_mov_b32_e32 v28, s57 ; GFX9-NEXT: v_mov_b32_e32 v26, s47 ; GFX9-NEXT: v_mov_b32_e32 v27, s46 ; GFX9-NEXT: v_mov_b32_e32 v25, s45 ; GFX9-NEXT: v_mov_b32_e32 v24, s43 ; GFX9-NEXT: v_mov_b32_e32 v23, s44 -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v22, s41 -; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v20, s42 +; GFX9-NEXT: v_mov_b32_e32 v21, s41 +; GFX9-NEXT: v_mov_b32_e32 v22, s40 ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v16, s27 @@ -23553,11 +23568,16 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v13, s8 ; GFX9-NEXT: v_mov_b32_e32 v14, s6 ; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; 
GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 ; GFX9-NEXT: .LBB61_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 @@ -23578,10 +23598,10 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 -; GFX9-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 @@ -23598,11 
+23618,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -25152,19 +25172,19 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_readfirstlane_b32 s45, v14 ; SI-NEXT: v_readfirstlane_b32 s42, v13 ; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s15, v11 +; SI-NEXT: v_readfirstlane_b32 s40, v11 ; SI-NEXT: v_readfirstlane_b32 s41, v10 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s40, v4 -; SI-NEXT: 
v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v0 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -25195,28 +25215,28 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s9, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: s_lshl_b32 s5, s40, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_and_b32 s4, s43, 0xff @@ -25249,34 +25269,38 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; 
SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_or_b32 s8, s8, s11 -; SI-NEXT: s_and_b32 s11, s40, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_or_b32 s6, s6, s11 -; SI-NEXT: s_and_b32 s11, s13, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s12, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s28, 0xff -; SI-NEXT: s_lshl_b32 s11, s29, 8 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s29, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s63, s63, 3 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s11, s26, 0xff -; SI-NEXT: s_lshl_b32 s13, s27, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_or_b32 s8, s10, s8 +; SI-NEXT: s_and_b32 s10, s26, 0xff +; SI-NEXT: s_lshl_b32 s12, s27, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s24, 0xff +; SI-NEXT: s_lshl_b32 s13, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 
; SI-NEXT: s_and_b32 s4, s63, 0xff ; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_add_i32 s61, s61, 3 @@ -25286,11 +25310,10 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_or_b32 s11, s13, s11 -; SI-NEXT: s_and_b32 s13, s24, 0xff -; SI-NEXT: s_lshl_b32 s14, s25, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s22, 0xff +; SI-NEXT: s_lshl_b32 s15, s23, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -25308,12 +25331,10 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s43, s43, 0xff ; SI-NEXT: s_lshl_b32 s42, s42, 8 ; SI-NEXT: s_and_b32 s41, s41, 0xff -; SI-NEXT: s_lshl_b32 s15, s15, 8 -; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_and_b32 s14, s22, 0xff -; SI-NEXT: s_lshl_b32 s22, s23, 8 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_lshl_b32 s40, s40, 8 +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: s_and_b32 s15, s20, 0xff +; SI-NEXT: s_lshl_b32 s20, s21, 8 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff @@ -25324,9 +25345,8 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s46, s46, s47 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: s_or_b32 s15, s15, s41 -; SI-NEXT: s_or_b32 s14, s22, s14 -; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_or_b32 s40, s40, s41 +; SI-NEXT: s_or_b32 s15, s20, s15 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -25336,32 +25356,32 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: 
s_addk_i32 s46, 0x300 ; SI-NEXT: s_addk_i32 s44, 0x300 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: s_addk_i32 s15, 0x300 -; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s40, 0x300 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_addk_i32 s13, 0x300 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s15, 0x300 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 @@ -25446,33 +25466,33 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: 
s_or_b32 s7, s7, s8 +; VI-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: v_or_b32_sdwa v3, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v28, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v24, v25 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v8, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v25 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -25606,58 +25626,58 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 ; GFX9-NEXT: s_cbranch_scc0 .LBB63_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v5, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 
16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s29, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v1, s7, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v7, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v18, v50 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_or_b32_sdwa v9, v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -25991,27 +26011,27 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: 
v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26023,92 +26043,92 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, 
v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v4, v29, v4 -; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v7, v23, v7 -; SI-NEXT: v_or_b32_e32 v8, v21, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: ; implicit-def: $vgpr20 ; 
SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, 
v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -26116,11 +26136,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -26128,10 +26148,10 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: 
v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -26308,41 +26328,41 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 @@ -26409,45 +26429,45 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; VI-NEXT: s_lshr_b32 s4, s24, 16 ; VI-NEXT: v_or_b32_e32 v9, v2, v1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s24, v0 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v2, s24, v0 ; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: v_add_f16_e32 v4, 
s23, v0 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_or_b32_e32 v7, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: v_add_f16_e32 v3, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_e32 v10, s20, v0 +; VI-NEXT: v_or_b32_e32 v5, v3, v1 +; VI-NEXT: v_add_f16_sdwa v1, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s19, v0 +; VI-NEXT: v_or_b32_e32 v4, v10, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: 
v_add_f16_e32 v13, s18, v0 ; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: v_or_b32_e32 v3, v11, v1 +; VI-NEXT: v_add_f16_sdwa v1, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v11, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_or_b32_e32 v2, v13, v1 +; VI-NEXT: v_or_b32_e32 v1, v10, v11 ; VI-NEXT: v_mov_b32_e32 v10, s4 ; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v0, s16, v0 @@ -26573,17 +26593,18 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v5f64_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 @@ -26598,41 +26619,41 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v26 
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 ; SI-NEXT: .LBB66_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_4 @@ -26643,10 +26664,10 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 @@ -26657,33 +26678,33 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; 
SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v0, v20 ; SI-NEXT: v_mov_b32_e32 v1, v29 -; SI-NEXT: v_mov_b32_e32 v2, v27 -; SI-NEXT: v_mov_b32_e32 v3, v26 -; SI-NEXT: v_mov_b32_e32 v4, v25 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: v_mov_b32_e32 v6, v24 -; SI-NEXT: v_mov_b32_e32 v7, v21 -; SI-NEXT: v_mov_b32_e32 v8, v22 -; SI-NEXT: v_mov_b32_e32 v9, v20 +; SI-NEXT: v_mov_b32_e32 v2, v24 +; SI-NEXT: v_mov_b32_e32 v3, v28 +; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v5, v27 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v7, v26 +; SI-NEXT: v_mov_b32_e32 v8, v23 +; SI-NEXT: v_mov_b32_e32 v9, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20f16: @@ -26797,40 +26818,40 @@ define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[9:10], s[24:25], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, 
v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -26979,27 +27000,27 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 
v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27011,92 +27032,92 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB68_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; 
SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v4, v29, v4 -; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v7, v23, v7 -; SI-NEXT: v_or_b32_e32 v8, v21, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: 
s_cbranch_execz .LBB68_2 ; SI-NEXT: .LBB68_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27104,11 +27125,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -27116,10 +27137,10 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 ; 
SI-NEXT: v_cvt_f32_f16_e32 v11, v16 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -27296,41 +27317,41 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; 
SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 @@ -27397,45 +27418,45 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s4, s24, 16 ; VI-NEXT: v_or_b32_e32 v9, v2, v1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s24, v0 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v2, s24, v0 ; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: v_add_f16_e32 v4, s23, v0 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; 
VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_or_b32_e32 v7, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: v_add_f16_e32 v3, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_e32 v10, s20, v0 +; VI-NEXT: v_or_b32_e32 v5, v3, v1 +; VI-NEXT: v_add_f16_sdwa v1, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s19, v0 +; VI-NEXT: v_or_b32_e32 v4, v10, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v13, s18, v0 ; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: v_or_b32_e32 v3, v11, v1 +; VI-NEXT: v_add_f16_sdwa v1, v12, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v11, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_or_b32_e32 v2, v13, v1 +; VI-NEXT: v_or_b32_e32 v1, v10, v11 ; VI-NEXT: v_mov_b32_e32 v10, s4 ; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v0, s16, v0 @@ -27561,27 +27582,23 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v5i64_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v21, v9 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v25, v5 -; SI-NEXT: v_mov_b32_e32 v24, v4 -; SI-NEXT: v_mov_b32_e32 v27, v3 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v29, v1 -; SI-NEXT: v_mov_b32_e32 v28, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 @@ -27594,84 +27611,79 @@ define <20 x half> 
@bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB70_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB70_4 -; SI-NEXT: .LBB70_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB70_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 +; SI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: .LBB70_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB70_2 -; SI-NEXT: .LBB70_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v29, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v27, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v25, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 -; SI-NEXT: 
v_addc_u32_e32 v12, vcc, 0, v23, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: s_cbranch_execz .LBB70_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: 
v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -27680,12 +27692,18 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v3, v28 +; SI-NEXT: v_mov_b32_e32 v5, v27 +; SI-NEXT: v_mov_b32_e32 v7, v26 +; SI-NEXT: v_mov_b32_e32 v9, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v20f16: @@ -29360,81 +29378,81 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v29 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v31 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_or_b32 s5, s5, s6 
; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v33 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: v_or_b32_e32 v5, v1, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: v_or_b32_e32 v6, v1, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: v_or_b32_e32 v7, v1, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s28, 0xff ; SI-NEXT: s_lshl_b32 s8, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 ; SI-NEXT: v_or_b32_e32 v3, s7, v0 -; 
SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 -; SI-NEXT: v_or_b32_e32 v0, v0, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v0, v0, v48 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v5, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_or_b32_e32 v9, v1, v2 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -29618,33 +29636,33 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_or_b32_sdwa v1, 
v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: v_or_b32_sdwa v3, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v8, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -29802,33 +29820,33 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v3, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v3, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v8, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s7, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: 
v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -30422,46 +30440,46 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 -; 
VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr12 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -30485,7 +30503,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; VI-NEXT: .LBB74_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB74_4 @@ -30495,12 +30513,12 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; VI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; VI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: 
v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -30524,14 +30542,14 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; VI-NEXT: .LBB74_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -30541,7 +30559,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; 
VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30555,7 +30573,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30569,7 +30587,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30583,14 +30601,14 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; 
VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30603,46 +30621,46 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; 
implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr12 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -30666,7 +30684,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB74_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB74_4 @@ -30676,12 +30694,12 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; GFX9-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; GFX9-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; 
GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -30705,14 +30723,14 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB74_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, 
v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -30721,7 +30739,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30733,7 +30751,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30745,7 +30763,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30757,13 +30775,13 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -31187,8 +31205,8 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v16, s21, v1, 8 ; SI-NEXT: v_mov_b32_e32 v1, s18 ; SI-NEXT: v_alignbit_b32 v8, s19, v1, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v1, 16 -; SI-NEXT: v_alignbit_b32 v17, s19, v1, 8 +; SI-NEXT: v_alignbit_b32 v17, s19, v1, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v1, 8 ; SI-NEXT: v_mov_b32_e32 v1, s16 ; SI-NEXT: v_alignbit_b32 v18, s17, v1, 24 ; SI-NEXT: v_alignbit_b32 v19, s17, v1, 16 @@ -31230,6 +31248,7 @@ define inreg <40 x i8> 
@bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v15, s21, v5, 16 ; SI-NEXT: v_alignbit_b32 v16, s21, v5, 8 ; SI-NEXT: v_alignbit_b32 v8, s19, v7, 24 +; SI-NEXT: v_alignbit_b32 v17, s19, v7, 16 ; SI-NEXT: s_lshr_b32 s6, s25, 24 ; SI-NEXT: s_lshr_b32 s7, s25, 16 ; SI-NEXT: s_lshr_b32 s8, s25, 8 @@ -31245,8 +31264,7 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s28, s17, 24 ; SI-NEXT: s_lshr_b32 s29, s17, 16 ; SI-NEXT: s_lshr_b32 s40, s17, 8 -; SI-NEXT: v_alignbit_b32 v10, s19, v7, 16 -; SI-NEXT: v_alignbit_b32 v17, s19, v7, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v7, 8 ; SI-NEXT: v_alignbit_b32 v18, s17, v9, 24 ; SI-NEXT: v_alignbit_b32 v19, s17, v9, 16 ; SI-NEXT: v_alignbit_b32 v20, s17, v9, 8 @@ -31258,8 +31276,8 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr26 @@ -31292,46 +31310,46 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: .LBB75_5: ; %end ; SI-NEXT: s_and_b32 s4, s17, 0xff ; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s29, 0xff ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s28, 24 ; SI-NEXT: v_or_b32_e32 v9, v9, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s28, 24 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: 
v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v9, v9, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v18, s4 -; SI-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v9, v9, v18 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_mov_b32_e32 v18, s4 ; SI-NEXT: s_and_b32 s4, s19, 0xff ; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v17 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s15, s15, 24 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 8, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 @@ -31443,107 +31461,107 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s17, 8 ; VI-NEXT: s_lshr_b32 s76, s16, 16 ; VI-NEXT: s_lshr_b32 s75, s16, 8 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; 
VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB75_4 ; VI-NEXT: .LBB75_2: ; %cmp.true -; VI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 ; VI-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 -; VI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 -; VI-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 -; VI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_add_f64 v[7:8], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] -; VI-NEXT: v_readfirstlane_b32 s17, v10 -; VI-NEXT: v_readfirstlane_b32 s19, v8 -; VI-NEXT: v_readfirstlane_b32 s21, v6 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; VI-NEXT: v_readfirstlane_b32 s17, v12 +; VI-NEXT: v_readfirstlane_b32 s19, v10 +; VI-NEXT: v_readfirstlane_b32 s21, v8 ; VI-NEXT: v_readfirstlane_b32 s23, v4 ; VI-NEXT: v_readfirstlane_b32 s25, v2 -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; VI-NEXT: 
s_lshr_b32 s26, s25, 24 ; VI-NEXT: s_lshr_b32 s27, s25, 16 ; VI-NEXT: s_lshr_b32 s28, s25, 8 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 ; VI-NEXT: s_lshr_b32 s29, s23, 24 ; VI-NEXT: s_lshr_b32 s40, s23, 16 ; VI-NEXT: s_lshr_b32 s41, s23, 8 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v3 ; VI-NEXT: s_lshr_b32 s42, s21, 24 ; VI-NEXT: s_lshr_b32 s43, s21, 16 ; VI-NEXT: s_lshr_b32 s44, s21, 8 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v5 ; VI-NEXT: s_lshr_b32 s45, s19, 24 ; VI-NEXT: s_lshr_b32 s46, s19, 16 ; VI-NEXT: s_lshr_b32 s47, s19, 8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v7 ; VI-NEXT: s_lshr_b32 s56, s17, 24 ; VI-NEXT: s_lshr_b32 s57, s17, 16 ; VI-NEXT: s_lshr_b32 s58, s17, 8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; VI-NEXT: s_branch .LBB75_5 ; VI-NEXT: .LBB75_3: ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr45 ; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr43 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr29 ; 
VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB75_2 ; VI-NEXT: .LBB75_4: -; VI-NEXT: v_mov_b32_e32 v9, s16 -; VI-NEXT: v_mov_b32_e32 v7, s18 -; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v7, s20 ; VI-NEXT: v_mov_b32_e32 v3, s22 ; VI-NEXT: v_mov_b32_e32 v1, s24 +; VI-NEXT: v_mov_b32_e32 v16, s12 ; VI-NEXT: v_mov_b32_e32 v25, s76 -; VI-NEXT: v_mov_b32_e32 v26, s75 +; VI-NEXT: v_mov_b32_e32 v2, s75 ; VI-NEXT: v_mov_b32_e32 v23, s74 ; VI-NEXT: v_mov_b32_e32 v24, s73 ; VI-NEXT: v_mov_b32_e32 v21, s72 ; VI-NEXT: v_mov_b32_e32 v22, s63 ; VI-NEXT: v_mov_b32_e32 v19, s62 ; VI-NEXT: v_mov_b32_e32 v20, s61 -; VI-NEXT: v_mov_b32_e32 v17, s60 +; VI-NEXT: v_mov_b32_e32 v6, s60 ; VI-NEXT: v_mov_b32_e32 v18, s59 ; VI-NEXT: v_mov_b32_e32 v15, s4 ; VI-NEXT: v_mov_b32_e32 v14, s6 ; VI-NEXT: v_mov_b32_e32 v13, s8 -; VI-NEXT: v_mov_b32_e32 v12, s10 -; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v5, s10 ; VI-NEXT: .LBB75_5: ; %end ; VI-NEXT: s_and_b32 s4, s17, 0xff ; VI-NEXT: s_lshl_b32 s5, s58, 8 @@ -31551,46 +31569,46 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; VI-NEXT: s_and_b32 s5, s57, 0xff ; VI-NEXT: s_lshl_b32 s6, s56, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v16 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v25, v4 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_and_b32 s4, s19, 0xff ; VI-NEXT: s_lshl_b32 s5, s47, 8 -; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s46, 0xff ; VI-NEXT: s_lshl_b32 s6, s45, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v23, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v15 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v23, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v9, s4 ; VI-NEXT: s_and_b32 s4, s21, 0xff ; VI-NEXT: s_lshl_b32 s5, s44, 8 -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 8, v0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v0 ; VI-NEXT: 
s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s43, 0xff ; VI-NEXT: s_lshl_b32 s6, s42, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; VI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -31608,7 +31626,7 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; VI-NEXT: s_and_b32 s5, s40, 0xff ; VI-NEXT: s_lshl_b32 s6, s29, 8 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -31627,9 +31645,9 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; VI-NEXT: s_and_b32 s5, s27, 0xff ; VI-NEXT: s_lshl_b32 s6, s26, 8 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: 
v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -31673,121 +31691,121 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s58, s17, 8 ; GFX9-NEXT: s_lshr_b32 s76, s16, 16 ; GFX9-NEXT: s_lshr_b32 s75, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB75_4 ; GFX9-NEXT: .LBB75_2: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_f64 v[7:8], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] -; GFX9-NEXT: v_readfirstlane_b32 s17, v10 -; GFX9-NEXT: v_readfirstlane_b32 s19, v8 -; GFX9-NEXT: v_readfirstlane_b32 s21, v6 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX9-NEXT: v_readfirstlane_b32 s17, v12 +; GFX9-NEXT: v_readfirstlane_b32 s19, v10 +; GFX9-NEXT: v_readfirstlane_b32 s21, v8 ; GFX9-NEXT: v_readfirstlane_b32 s23, v4 ; GFX9-NEXT: v_readfirstlane_b32 s25, v2 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; 
GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: s_lshr_b32 s26, s25, 24 ; GFX9-NEXT: s_lshr_b32 s27, s25, 16 ; GFX9-NEXT: s_lshr_b32 s28, s25, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v1 ; GFX9-NEXT: s_lshr_b32 s29, s23, 24 ; GFX9-NEXT: s_lshr_b32 s40, s23, 16 ; GFX9-NEXT: s_lshr_b32 s41, s23, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v3 ; GFX9-NEXT: s_lshr_b32 s42, s21, 24 ; GFX9-NEXT: s_lshr_b32 s43, s21, 16 ; GFX9-NEXT: s_lshr_b32 s44, s21, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v5 ; GFX9-NEXT: s_lshr_b32 s45, s19, 24 ; GFX9-NEXT: s_lshr_b32 s46, s19, 16 ; GFX9-NEXT: s_lshr_b32 s47, s19, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v7 ; GFX9-NEXT: s_lshr_b32 s56, s17, 24 ; GFX9-NEXT: s_lshr_b32 s57, s17, 16 ; GFX9-NEXT: s_lshr_b32 s58, s17, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; GFX9-NEXT: s_branch .LBB75_5 ; GFX9-NEXT: .LBB75_3: ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr76 -; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr12 ; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr57 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: ; implicit-def: $sgpr73 ; GFX9-NEXT: ; implicit-def: $sgpr74 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr4 ; GFX9-NEXT: ; implicit-def: $sgpr47 ; GFX9-NEXT: ; implicit-def: 
$sgpr46 ; GFX9-NEXT: ; implicit-def: $sgpr45 ; GFX9-NEXT: ; implicit-def: $sgpr63 ; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: ; implicit-def: $sgpr44 ; GFX9-NEXT: ; implicit-def: $sgpr43 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr62 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 ; GFX9-NEXT: ; implicit-def: $sgpr29 ; GFX9-NEXT: ; implicit-def: $sgpr59 ; GFX9-NEXT: ; implicit-def: $sgpr60 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr10 ; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB75_2 ; GFX9-NEXT: .LBB75_4: -; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: v_mov_b32_e32 v7, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s20 ; GFX9-NEXT: v_mov_b32_e32 v3, s22 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v16, s12 ; GFX9-NEXT: v_mov_b32_e32 v25, s76 -; GFX9-NEXT: v_mov_b32_e32 v26, s75 +; GFX9-NEXT: v_mov_b32_e32 v2, s75 ; GFX9-NEXT: v_mov_b32_e32 v23, s74 ; GFX9-NEXT: v_mov_b32_e32 v24, s73 ; GFX9-NEXT: v_mov_b32_e32 v21, s72 ; GFX9-NEXT: v_mov_b32_e32 v22, s63 ; GFX9-NEXT: v_mov_b32_e32 v19, s62 ; GFX9-NEXT: v_mov_b32_e32 v20, s61 -; GFX9-NEXT: v_mov_b32_e32 v17, s60 +; GFX9-NEXT: v_mov_b32_e32 v6, s60 ; GFX9-NEXT: v_mov_b32_e32 v18, s59 ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_mov_b32_e32 v14, s6 ; GFX9-NEXT: v_mov_b32_e32 v13, s8 -; GFX9-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 ; GFX9-NEXT: .LBB75_5: ; %end ; GFX9-NEXT: s_and_b32 s4, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s58, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: 
s_and_b32 s5, s57, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s56, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v16 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -31796,28 +31814,28 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s46, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s45, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v24 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v14 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v24 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v15 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: s_and_b32 s4, s21, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s44, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s43, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s42, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v14 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 @@ -31833,7 +31851,7 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s5, s40, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v13 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, 
s4, 0xffff @@ -31850,9 +31868,9 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s5, s27, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s26, 8 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33471,81 +33489,81 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v29 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v31 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v33 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; 
SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: v_or_b32_e32 v5, v1, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: v_or_b32_e32 v6, v1, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: v_or_b32_e32 v7, v1, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s28, 0xff ; SI-NEXT: s_lshl_b32 s8, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 ; SI-NEXT: v_or_b32_e32 v3, s7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 -; SI-NEXT: v_or_b32_e32 v0, v0, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 
-; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v0, v0, v48 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v5, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_or_b32_e32 v9, v1, v2 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -33729,33 +33747,33 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_or_b32_sdwa v1, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: 
s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: v_or_b32_sdwa v3, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v8, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v27, v50 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 
; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -33913,33 +33931,33 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v3, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v20, v21 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v8, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s7, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -34538,46 +34556,46 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 +; 
VI-NEXT: ; implicit-def: $vgpr12 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -34601,7 +34619,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; VI-NEXT: .LBB78_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB78_4 @@ -34616,12 +34634,12 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 
v[13:14], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -34645,14 +34663,14 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; VI-NEXT: .LBB78_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -34662,7 +34680,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v3, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34676,7 +34694,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34690,7 +34708,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34704,14 +34722,14 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34724,46 +34742,46 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: 
; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr12 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB78_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -34787,7 +34805,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB78_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB78_4 @@ -34802,12 +34820,12 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 3, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v10, vcc -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; 
GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 @@ -34831,14 +34849,14 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX9-NEXT: .LBB78_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -34847,7 +34865,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34859,7 +34877,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34871,7 +34889,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34883,13 +34901,13 @@ define <40 x i8> 
@bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index 6fc9a35cd9ee6..a3c2835ef661e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -549,6 +549,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v11i32_to_v22i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, v11 ; SI-NEXT: v_mov_b32_e32 v20, v10 ; SI-NEXT: v_mov_b32_e32 v18, v9 ; SI-NEXT: v_mov_b32_e32 v16, v8 @@ -559,7 +560,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v6, v3 ; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -918,6 +919,11 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v22i16_to_v11i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v29, v9 +; SI-NEXT: v_mov_b32_e32 v30, v7 +; SI-NEXT: v_mov_b32_e32 v31, v5 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v1 ; SI-NEXT: v_mov_b32_e32 v28, v10 ; SI-NEXT: v_mov_b32_e32 v27, v8 ; SI-NEXT: v_mov_b32_e32 v26, v6 @@ -925,11 +931,11 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v24, v2 ; SI-NEXT: v_mov_b32_e32 v23, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 @@ -959,37 +965,37 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v32 -; SI-NEXT: v_or_b32_e32 v4, v4, v31 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_or_b32_e32 v6, v6, v29 -; SI-NEXT: v_or_b32_e32 v7, v7, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v15 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: 
$vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 @@ -1184,21 +1190,21 @@ define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 ; SI-NEXT: 
s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v10, v0, v15 +; SI-NEXT: v_or_b32_e32 v8, v0, v17 +; SI-NEXT: v_or_b32_e32 v9, v1, v16 +; SI-NEXT: v_or_b32_e32 v10, v2, v15 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -1444,26 +1450,23 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v11i32_to_v22f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v31, v9 -; SI-NEXT: v_mov_b32_e32 v30, v8 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v27, v5 -; SI-NEXT: v_mov_b32_e32 v26, v4 -; SI-NEXT: v_mov_b32_e32 v25, v3 -; SI-NEXT: v_mov_b32_e32 v24, v2 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_mov_b32_e32 v22, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v27, v9 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -1480,106 +1483,106 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 
v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v35 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; 
SI-NEXT: v_add_i32_e32 v13, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 
v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: v_mov_b32_e32 v3, v31 +; SI-NEXT: v_mov_b32_e32 v5, v30 +; SI-NEXT: v_mov_b32_e32 v7, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22f16: @@ -1737,13 +1740,14 @@ define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 
v20, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 @@ -1755,8 +1759,7 @@ define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: .LBB9_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: @@ -1907,29 +1910,40 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v11i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_mov_b32_e32 v23, v10 +; SI-NEXT: v_mov_b32_e32 v24, v9 +; SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_mov_b32_e32 v27, v6 +; SI-NEXT: v_mov_b32_e32 v28, v5 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v30, v3 +; SI-NEXT: v_mov_b32_e32 v31, v2 +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v26 +; 
SI-NEXT: v_cvt_f16_f32_e32 v30, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1941,57 +1955,57 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v4, v29, v4 -; SI-NEXT: 
v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v7, v23, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1999,10 +2013,10 @@ 
define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2010,11 +2024,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2022,11 +2036,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2034,11 +2048,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, 
v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2046,7 +2060,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 @@ -2207,7 +2221,8 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 @@ -2216,8 +2231,7 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2228,7 +2242,7 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: 
v_lshlrev_b32_e32 v6, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 @@ -2238,8 +2252,8 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v2, v27, v2 ; SI-NEXT: v_or_b32_e32 v3, v25, v3 ; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v6, v19, v6 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 ; SI-NEXT: v_or_b32_e32 v7, v17, v7 ; SI-NEXT: v_or_b32_e32 v8, v15, v8 ; SI-NEXT: v_or_b32_e32 v9, v13, v9 @@ -2284,7 +2298,7 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2292,10 +2306,10 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -2515,6 +2529,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v11f32_to_v22i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, v11 ; SI-NEXT: v_mov_b32_e32 v20, v10 ; SI-NEXT: v_mov_b32_e32 v18, v9 ; SI-NEXT: v_mov_b32_e32 v16, v8 @@ -2525,7 +2540,7 @@ define <22 x i16> 
@bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v6, v3 ; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -2891,6 +2906,11 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v22i16_to_v11f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v29, v9 +; SI-NEXT: v_mov_b32_e32 v30, v7 +; SI-NEXT: v_mov_b32_e32 v31, v5 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v1 ; SI-NEXT: v_mov_b32_e32 v28, v10 ; SI-NEXT: v_mov_b32_e32 v27, v8 ; SI-NEXT: v_mov_b32_e32 v26, v6 @@ -2898,11 +2918,11 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v24, v2 ; SI-NEXT: v_mov_b32_e32 v23, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 @@ -2932,37 +2952,37 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v32 -; SI-NEXT: v_or_b32_e32 v4, v4, v31 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_or_b32_e32 v6, v6, v29 -; 
SI-NEXT: v_or_b32_e32 v7, v7, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v15 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 @@ -3157,21 +3177,21 @@ define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v0, v17 -; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v10, v0, v15 +; SI-NEXT: v_or_b32_e32 v8, v0, v17 +; SI-NEXT: v_or_b32_e32 v9, v1, v16 +; SI-NEXT: v_or_b32_e32 v10, v2, v15 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -3417,26 +3437,23 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v11f32_to_v22f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v31, v9 -; SI-NEXT: v_mov_b32_e32 v30, v8 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v27, v5 -; SI-NEXT: v_mov_b32_e32 v26, v4 -; SI-NEXT: v_mov_b32_e32 v25, v3 -; SI-NEXT: v_mov_b32_e32 v24, v2 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_mov_b32_e32 v22, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v27, v9 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: 
$vgpr10 @@ -3453,106 +3470,106 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 
16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v35 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v22 -; 
SI-NEXT: v_add_f32_e32 v3, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: v_mov_b32_e32 v3, v31 +; SI-NEXT: v_mov_b32_e32 v5, v30 +; SI-NEXT: v_mov_b32_e32 v7, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22f16: @@ -3680,46 +3697,46 @@ define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s26, 1.0 ; SI-NEXT: v_add_f32_e64 v0, 
s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s25, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -3892,29 +3909,40 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v11f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_mov_b32_e32 v23, v10 +; SI-NEXT: v_mov_b32_e32 v24, v9 +; SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_mov_b32_e32 v27, v6 +; SI-NEXT: v_mov_b32_e32 v28, v5 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v30, v3 +; SI-NEXT: v_mov_b32_e32 v31, v2 +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: 
v_cvt_f16_f32_e32 v37, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3926,57 +3954,57 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 ; SI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v4, v29, v4 -; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v7, v23, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, 
v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3984,10 +4012,10 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3995,11 +4023,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4007,11 +4035,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4019,11 +4047,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4031,7 +4059,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 @@ -4192,7 +4220,8 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 @@ -4201,8 +4230,7 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB19_4 @@ 
-4213,7 +4241,7 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 @@ -4223,8 +4251,8 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v2, v27, v2 ; SI-NEXT: v_or_b32_e32 v3, v25, v3 ; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v6, v19, v6 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 ; SI-NEXT: v_or_b32_e32 v7, v17, v7 ; SI-NEXT: v_or_b32_e32 v8, v15, v8 ; SI-NEXT: v_or_b32_e32 v9, v13, v9 @@ -4269,7 +4297,7 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4277,10 +4305,10 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -4659,11 +4687,6 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz 
.LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 3 -; VI-NEXT: v_add_u16_sdwa v19, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v10, 3, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v19 -; VI-NEXT: v_add_u16_sdwa v19, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v9, 3, v9 ; VI-NEXT: v_add_u16_sdwa v12, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v13, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v14, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4671,9 +4694,12 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v16, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v17, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v18, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v19 ; VI-NEXT: v_add_u16_sdwa v19, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v11, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v20, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v10, 3, v10 +; VI-NEXT: v_add_u16_sdwa v21, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v11, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v9, 3, v9 ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_add_u16_e32 v6, 3, v6 @@ -4683,7 +4709,9 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; VI-NEXT: 
v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_or_b32_e32 v10, v10, v20 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_or_b32_e32 v8, v8, v21 ; VI-NEXT: v_or_b32_e32 v7, v7, v19 ; VI-NEXT: v_or_b32_e32 v6, v6, v18 ; VI-NEXT: v_or_b32_e32 v5, v5, v17 @@ -4766,14 +4794,14 @@ define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mov_b32_e32 v26, v7 -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: v_mov_b32_e32 v24, v5 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v22, v3 -; SI-NEXT: v_mov_b32_e32 v29, v2 -; SI-NEXT: v_mov_b32_e32 v28, v1 -; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v5 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v29, v3 +; SI-NEXT: v_mov_b32_e32 v28, v2 +; SI-NEXT: v_mov_b32_e32 v27, v1 +; SI-NEXT: v_mov_b32_e32 v26, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -4791,24 +4819,24 @@ define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v23 +; SI-NEXT: 
v_cvt_f32_f16_e32 v20, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v26 ; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 @@ -5186,31 +5214,31 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 -; VI-NEXT: v_add_f16_sdwa v10, v10, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v19, v10 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 -; VI-NEXT: v_add_f16_sdwa v9, v9, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v11, 0x200, v0 -; VI-NEXT: v_add_f16_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v13, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v14, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v3 -; VI-NEXT: v_add_f16_sdwa v3, v3, v12 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v16, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v5, v5, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v6 -; VI-NEXT: v_add_f16_sdwa v6, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v19, v9 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v5, v5, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v6, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_sdwa v7, v7, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v12 +; VI-NEXT: v_add_f16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v12, v9, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v10, v20, v10 +; VI-NEXT: v_or_b32_e32 v9, v9, v12 +; VI-NEXT: v_or_b32_e32 v8, v21, v8 ; VI-NEXT: v_or_b32_e32 v7, v19, v7 ; VI-NEXT: v_or_b32_e32 v6, v18, v6 ; VI-NEXT: v_or_b32_e32 v5, v17, v5 @@ -5293,15 +5321,15 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; SI-LABEL: bitcast_v22f16_to_v22i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v7 -; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v17, v3 ; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 @@ -5314,15 +5342,16 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB23_4 @@ -5435,62 +5464,62 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; 
VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s26, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s26, 16 -; VI-NEXT: v_add_f16_e32 v1, s25, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s24, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_add_f16_e32 v3, s26, v0 ; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 +; 
VI-NEXT: s_lshr_b32 s4, s24, 16 ; VI-NEXT: v_or_b32_e32 v10, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 ; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v9, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v8, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v6, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v5, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v4, 
v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_add_f16_e32 v3, s19, v0 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v12, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_or_b32_e32 v3, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_add_f16_e32 v14, s18, v0 ; VI-NEXT: v_add_f16_e32 v11, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_or_b32_e32 v0, v11, v12 +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: v_or_b32_e32 v2, v14, v2 +; VI-NEXT: v_add_f16_sdwa v14, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v0, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v12, v0 +; VI-NEXT: v_or_b32_e32 v0, v11, v14 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index c9860dbb7d72c..719347108e67e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -2080,38 +2080,38 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 ; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_or_b32_e32 v3, v3, v35 -; SI-NEXT: v_or_b32_e32 v4, v4, v34 -; SI-NEXT: v_or_b32_e32 v5, v5, v33 -; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: 
v_or_b32_e32 v7, v7, v31 -; SI-NEXT: v_or_b32_e32 v8, v8, v24 -; SI-NEXT: v_or_b32_e32 v9, v9, v17 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v33 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 @@ -2143,8 +2143,8 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_or_b32_e32 v4, v34, v4 @@ -2313,14 +2313,16 @@ define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: 
s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_or_b32_e32 v8, v0, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -2330,11 +2332,9 @@ define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v10, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: v_or_b32_e32 v11, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -2594,31 +2594,27 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v12i32_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v11 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v21, v12 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v32, v8 ; SI-NEXT: v_mov_b32_e32 v31, v7 ; SI-NEXT: v_mov_b32_e32 v30, v6 -; SI-NEXT: v_mov_b32_e32 v29, v5 ; SI-NEXT: v_mov_b32_e32 v28, v4 -; SI-NEXT: v_mov_b32_e32 v27, v3 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: v_mov_b32_e32 v24, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: 
$vgpr1 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -2633,114 +2629,115 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v13 +; SI-NEXT: 
v_cvt_f32_f16_e32 v19, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; 
SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v35 +; SI-NEXT: v_mov_b32_e32 v3, v34 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v9, v26 +; SI-NEXT: v_mov_b32_e32 v11, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24f16: @@ -3085,31 +3082,31 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v12i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 +; SI-NEXT: v_cvt_f16_f32_e32 
v32, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3121,104 +3118,104 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v33, v4 
-; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v7, v27, v7 -; SI-NEXT: v_or_b32_e32 v8, v25, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: v_or_b32_e32 v10, v14, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v6, v28, v6 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v7, v26, v7 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 
v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_add_f32_e32 v4, 
0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3226,11 +3223,11 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3238,7 +3235,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 @@ -3401,8 +3398,9 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 ; SI-NEXT: v_cvt_f16_f32_e32 
v28, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 @@ -3413,9 +3411,8 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -3425,9 +3422,9 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 @@ -3437,8 +3434,8 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v2, v30, v2 ; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v4, v25, v4 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 ; SI-NEXT: v_or_b32_e32 v6, v22, v6 ; SI-NEXT: v_or_b32_e32 v7, v20, v7 ; SI-NEXT: v_or_b32_e32 v8, v18, v8 @@ -3476,23 +3473,23 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x 
half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -5275,38 +5272,38 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 ; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_or_b32_e32 v3, v3, v35 -; SI-NEXT: v_or_b32_e32 v4, v4, v34 -; SI-NEXT: v_or_b32_e32 v5, v5, v33 -; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_or_b32_e32 v7, v7, v31 -; SI-NEXT: v_or_b32_e32 v8, v8, v24 -; SI-NEXT: v_or_b32_e32 v9, v9, v17 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v3, 
v3, v35 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v33 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 @@ -5338,8 +5335,8 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_or_b32_e32 v4, v34, v4 @@ -5508,14 +5505,16 @@ define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v7, 
v0, v21 +; SI-NEXT: v_or_b32_e32 v8, v0, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -5525,11 +5524,9 @@ define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v10, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: v_or_b32_e32 v11, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -5789,31 +5786,27 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v12f32_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v11 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v21, v12 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v32, v8 ; SI-NEXT: v_mov_b32_e32 v31, v7 ; SI-NEXT: v_mov_b32_e32 v30, v6 -; SI-NEXT: v_mov_b32_e32 v29, v5 ; SI-NEXT: v_mov_b32_e32 v28, v4 -; SI-NEXT: v_mov_b32_e32 v27, v3 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: v_mov_b32_e32 v24, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: 
$vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -5828,114 +5821,115 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_4 -; SI-NEXT: .LBB32_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB32_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 
16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 +; 
SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 -; SI-NEXT: .LBB32_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v31 +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: 
v_add_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v35 
+; SI-NEXT: v_mov_b32_e32 v3, v34 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v9, v26 +; SI-NEXT: v_mov_b32_e32 v11, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24f16: @@ -6289,31 +6283,31 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v12f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 ; SI-NEXT: 
v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6325,104 +6319,104 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v33, v4 -; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v7, v27, v7 -; SI-NEXT: v_or_b32_e32 v8, v25, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: v_or_b32_e32 v10, v14, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 +; 
SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v6, v28, v6 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v7, v26, v7 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, 
v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, 
v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -6430,11 +6424,11 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -6442,7 +6436,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 @@ -6605,8 +6599,9 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 @@ -6617,9 +6612,8 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 ; SI-NEXT: 
v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -6629,9 +6623,9 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 @@ -6641,8 +6635,8 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v2, v30, v2 ; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v4, v25, v4 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 ; SI-NEXT: v_or_b32_e32 v6, v22, v6 ; SI-NEXT: v_or_b32_e32 v7, v20, v7 ; SI-NEXT: v_or_b32_e32 v8, v18, v8 @@ -6680,23 +6674,23 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: 
v_cvt_f32_f16_e32 v6, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -7888,38 +7882,38 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_or_b32_e32 v5, v5, v36 -; SI-NEXT: v_or_b32_e32 v6, v6, v35 -; SI-NEXT: v_or_b32_e32 v7, v7, v34 -; SI-NEXT: v_or_b32_e32 v8, v8, v33 -; SI-NEXT: v_or_b32_e32 v9, v9, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 -; SI-NEXT: v_or_b32_e32 v11, v11, v17 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v4, v4, v37 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_or_b32_e32 v7, v7, v34 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; 
implicit-def: $vgpr39 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -7951,8 +7945,8 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v39, v2 ; SI-NEXT: v_or_b32_e32 v3, v38, v3 ; SI-NEXT: v_or_b32_e32 v4, v37, v4 @@ -8121,14 +8115,16 @@ define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v7, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v25 +; SI-NEXT: v_or_b32_e32 v8, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -8138,11 +8134,9 @@ define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v10, v0, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 
; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v24 ; SI-NEXT: v_or_b32_e32 v11, v0, v21 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -8408,19 +8402,20 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v6f64_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_mov_b32_e32 v21, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -8437,47 +8432,47 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 
16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 +; SI-NEXT: v_cvt_f32_f16_e32 
v15, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v1 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -8490,11 +8485,11 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 @@ -8507,38 +8502,38 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 
v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, v24 ; SI-NEXT: v_mov_b32_e32 v1, v35 -; SI-NEXT: v_mov_b32_e32 v2, v33 -; SI-NEXT: v_mov_b32_e32 v3, v32 -; SI-NEXT: v_mov_b32_e32 v4, v31 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: v_mov_b32_e32 v6, v30 -; SI-NEXT: v_mov_b32_e32 v7, v27 -; SI-NEXT: v_mov_b32_e32 v8, v28 -; SI-NEXT: v_mov_b32_e32 v9, v24 -; SI-NEXT: v_mov_b32_e32 v10, v26 -; SI-NEXT: v_mov_b32_e32 v11, v25 +; SI-NEXT: v_mov_b32_e32 v2, v29 +; SI-NEXT: v_mov_b32_e32 v3, v34 +; SI-NEXT: v_mov_b32_e32 v4, v25 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v7, v32 +; SI-NEXT: v_mov_b32_e32 v8, v27 +; SI-NEXT: v_mov_b32_e32 v9, v31 +; SI-NEXT: v_mov_b32_e32 v10, v28 +; SI-NEXT: v_mov_b32_e32 v11, v30 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24f16: @@ -8661,47 +8656,47 @@ define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[7:8], 
s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[9:10], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[26:27], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 
v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -8857,31 +8852,31 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v6f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: 
v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8893,104 +8888,104 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 
v5, v33, v5 -; SI-NEXT: v_or_b32_e32 v6, v31, v6 -; SI-NEXT: v_or_b32_e32 v7, v29, v7 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v9, v25, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 ; 
SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -8998,11 +8993,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -9010,11 +9005,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: 
v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9168,13 +9163,14 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 @@ -9185,9 +9181,8 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -9195,22 +9190,22 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 ; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v28, v4 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 ; SI-NEXT: v_or_b32_e32 v7, v24, v7 ; SI-NEXT: v_or_b32_e32 v8, v22, v8 @@ -9220,51 +9215,51 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: 
v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -9337,55 +9332,55 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; VI-NEXT: s_lshr_b32 s4, s26, 16 ; VI-NEXT: v_or_b32_e32 v11, v2, v1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s26, v0 ; VI-NEXT: s_lshr_b32 s4, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v2, s26, v0 ; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: v_add_f16_e32 v4, s25, v0 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: v_add_f16_e32 v6, s24, v0 +; VI-NEXT: v_or_b32_e32 v9, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: v_add_f16_e32 v3, s23, v0 +; VI-NEXT: v_or_b32_e32 v8, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: v_add_f16_e32 v5, s22, v0 +; VI-NEXT: v_or_b32_e32 v7, v3, v1 +; VI-NEXT: v_add_f16_sdwa v1, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: v_add_f16_e32 v12, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_or_b32_e32 v5, v12, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: v_add_f16_e32 v13, s19, v0 +; VI-NEXT: v_or_b32_e32 v4, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v14, s18, v0 +; VI-NEXT: v_add_f16_e32 v15, s17, v0 +; VI-NEXT: v_or_b32_e32 v3, v13, v1 
+; VI-NEXT: v_add_f16_sdwa v1, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v12, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_or_b32_e32 v2, v14, v1 +; VI-NEXT: v_or_b32_e32 v1, v15, v12 ; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v0, s16, v0 @@ -9948,38 +9943,38 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_or_b32_e32 v5, v5, v36 -; SI-NEXT: v_or_b32_e32 v6, v6, v35 -; SI-NEXT: v_or_b32_e32 v7, v7, v34 -; SI-NEXT: v_or_b32_e32 v8, v8, v33 -; SI-NEXT: v_or_b32_e32 v9, v9, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 -; SI-NEXT: v_or_b32_e32 v11, v11, v17 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v4, v4, v37 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_or_b32_e32 v7, v7, v34 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 ; SI-NEXT: ; 
implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -10011,8 +10006,8 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v39, v2 ; SI-NEXT: v_or_b32_e32 v3, v38, v3 ; SI-NEXT: v_or_b32_e32 v4, v37, v4 @@ -10181,14 +10176,16 @@ define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v7, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v25 +; SI-NEXT: v_or_b32_e32 v8, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -10198,11 +10195,9 @@ define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v10, v0, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v24 
; SI-NEXT: v_or_b32_e32 v11, v0, v21 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -10468,31 +10463,26 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v6i64_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 -; SI-NEXT: v_mov_b32_e32 v27, v9 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v31, v5 -; SI-NEXT: v_mov_b32_e32 v30, v4 -; SI-NEXT: v_mov_b32_e32 v33, v3 -; SI-NEXT: v_mov_b32_e32 v32, v2 -; SI-NEXT: v_mov_b32_e32 v35, v1 -; SI-NEXT: v_mov_b32_e32 v34, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v21, v12 +; SI-NEXT: v_mov_b32_e32 v26, v10 +; SI-NEXT: v_mov_b32_e32 v24, v8 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -10507,98 +10497,93 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: 
s_cbranch_execnz .LBB52_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB52_4 -; SI-NEXT: .LBB52_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB52_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; 
SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
-; SI-NEXT: s_cbranch_execz .LBB52_2 -; SI-NEXT: .LBB52_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v35, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v33, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v30 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v31, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v28 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v29, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v26 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v27, vcc -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v24 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v25, vcc -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: 
v_add_i32_e32 v10, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -10608,13 +10593,20 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v35 +; SI-NEXT: v_mov_b32_e32 v3, v34 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v7, v32 +; SI-NEXT: v_mov_b32_e32 v9, v30 +; SI-NEXT: v_mov_b32_e32 v11, v28 ; SI-NEXT: s_setpc_b64 
s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24f16: @@ -10962,31 +10954,31 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v6i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10998,104 +10990,104 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_or_b32_e32 v6, v31, v6 -; SI-NEXT: v_or_b32_e32 v7, v29, v7 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v9, v25, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: 
v_or_b32_e32 v2, v38, v2 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; 
SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -11103,11 +11095,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -11115,11 +11107,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -11273,13 +11265,14 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v32, 
s22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 @@ -11290,9 +11283,8 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -11300,22 +11292,22 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v28, v4 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 ; SI-NEXT: v_or_b32_e32 
v6, v26, v6 ; SI-NEXT: v_or_b32_e32 v7, v24, v7 ; SI-NEXT: v_or_b32_e32 v8, v22, v8 @@ -11325,51 +11317,51 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; 
SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -11442,55 +11434,55 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s4, s26, 16 ; VI-NEXT: v_or_b32_e32 v11, v2, v1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s26, v0 ; VI-NEXT: s_lshr_b32 s4, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v2, s26, v0 ; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: v_add_f16_e32 v4, s25, v0 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: v_add_f16_e32 v6, s24, v0 +; VI-NEXT: v_or_b32_e32 v9, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: v_add_f16_e32 v3, s23, v0 +; VI-NEXT: v_or_b32_e32 v8, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: v_add_f16_e32 v5, s22, v0 +; VI-NEXT: v_or_b32_e32 v7, v3, v1 +; VI-NEXT: v_add_f16_sdwa v1, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; 
VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: v_add_f16_e32 v12, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_or_b32_e32 v5, v12, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: v_add_f16_e32 v13, s19, v0 +; VI-NEXT: v_or_b32_e32 v4, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v14, s18, v0 +; VI-NEXT: v_add_f16_e32 v15, s17, v0 +; VI-NEXT: v_or_b32_e32 v3, v13, v1 +; VI-NEXT: v_add_f16_sdwa v1, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v12, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_or_b32_e32 v2, v14, v1 +; VI-NEXT: v_or_b32_e32 v1, v15, v12 ; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v0, s16, v0 @@ 
-11620,32 +11612,27 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v24i16_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v23 -; SI-NEXT: v_mov_b32_e32 v54, v22 -; SI-NEXT: v_mov_b32_e32 v53, v21 -; SI-NEXT: v_mov_b32_e32 v52, v20 -; SI-NEXT: v_mov_b32_e32 v51, v19 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_mov_b32_e32 v49, v17 -; SI-NEXT: v_mov_b32_e32 v48, v16 -; SI-NEXT: v_mov_b32_e32 v39, v15 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v37, v13 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v35, v11 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v9 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: v_mov_b32_e32 v31, v7 -; SI-NEXT: v_mov_b32_e32 v30, v6 -; SI-NEXT: v_mov_b32_e32 v29, v5 -; SI-NEXT: v_mov_b32_e32 v28, v4 -; SI-NEXT: v_mov_b32_e32 v27, v3 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v0 +; SI-NEXT: v_mov_b32_e32 v55, v20 +; SI-NEXT: v_mov_b32_e32 v54, v19 +; SI-NEXT: v_mov_b32_e32 v53, v18 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -11668,40 +11655,34 @@ define 
<24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; 
SI-NEXT: v_cvt_f32_f16_e32 v11, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 @@ -11723,34 +11704,40 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -11772,13 +11759,14 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: v_mov_b32_e32 v22, v25 +; SI-NEXT: v_mov_b32_e32 v23, v26 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v24i16_to_v24f16: @@ -11791,26 +11779,24 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 3 -; VI-NEXT: v_add_u16_sdwa v19, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v20, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v11, 3, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v19 -; VI-NEXT: v_add_u16_sdwa v19, v10, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v10, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v10, 3, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v19 -; VI-NEXT: v_add_u16_sdwa v19, v9, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v9, 3, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v19 -; VI-NEXT: v_add_u16_sdwa v19, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_add_u16_sdwa v13, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v14, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v15, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v16, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v17, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v18, v5, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v19 ; VI-NEXT: v_add_u16_sdwa v19, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v12, v7, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v20 +; VI-NEXT: v_add_u16_sdwa v20, v9, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v21 +; VI-NEXT: v_add_u16_e32 v9, 3, v9 +; VI-NEXT: v_add_u16_sdwa v21, v7, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_add_u16_e32 v5, 3, v5 @@ -11819,7 +11805,9 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_e32 v7, v7, v12 +; VI-NEXT: v_or_b32_e32 v9, v9, v20 +; VI-NEXT: v_or_b32_e32 v8, v8, v12 +; VI-NEXT: v_or_b32_e32 v7, v7, v21 ; VI-NEXT: v_or_b32_e32 v6, v6, v19 ; VI-NEXT: v_or_b32_e32 v5, v5, v18 ; VI-NEXT: v_or_b32_e32 v4, v4, v17 @@ -12237,63 +12225,59 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: 
v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: 
v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_or_b32_e32 v18, v18, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -12305,39 +12289,43 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 ; SI-NEXT: v_or_b32_e32 v6, v6, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 
16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v24, 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_or_b32_e32 v20, v20, v24 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -12352,35 +12340,35 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v11 ; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v19, v11 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v10 ; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v19, v10 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 -; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v19, v9 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 -; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v12, 0x200, v0 -; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v14, 0x200, v1 -; VI-NEXT: 
v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v16, 0x200, v3 -; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v19, v8 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v11, v20, v11 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v10, v21, v10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v13, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v13 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v13, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v9, v20, v9 +; VI-NEXT: v_or_b32_e32 v8, v8, v13 +; VI-NEXT: v_or_b32_e32 v7, v21, v7 ; VI-NEXT: v_or_b32_e32 v6, v19, v6 ; VI-NEXT: v_or_b32_e32 v5, v18, v5 ; VI-NEXT: v_or_b32_e32 v4, v17, v4 @@ -12464,9 +12452,9 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v9 -; SI-NEXT: v_mov_b32_e32 v12, v8 -; SI-NEXT: v_mov_b32_e32 v13, v7 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 @@ -12474,7 +12462,7 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 @@ -12485,6 +12473,10 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -12492,76 +12484,69 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: 
v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: 
v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_or_b32_e32 v18, v18, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -12573,39 +12558,43 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 ; SI-NEXT: v_or_b32_e32 v6, v6, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v24, 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; SI-NEXT: 
v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_or_b32_e32 v20, v20, v24 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: @@ -12619,67 +12608,67 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_4 ; VI-NEXT: .LBB59_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s26, 16 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s27, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s27, 16 -; VI-NEXT: v_add_f16_e32 v1, s26, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s25, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s24, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; 
VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_add_f16_e32 v3, s27, v0 ; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: s_lshr_b32 s4, s25, 16 ; VI-NEXT: v_or_b32_e32 v11, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 ; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v10, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v9, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v8, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v6, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v5, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v4, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_e32 v3, s19, v0 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v13, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_or_b32_e32 v3, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_e32 v15, s18, v0 ; VI-NEXT: v_add_f16_e32 v12, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_or_b32_e32 v0, v12, v13 +; VI-NEXT: v_add_f16_e32 v13, s17, v0 +; VI-NEXT: v_or_b32_e32 v2, v15, v2 +; VI-NEXT: v_add_f16_sdwa v15, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v13, v0 +; VI-NEXT: v_or_b32_e32 v0, v12, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB59_3: ; VI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index eaf314d4b65dc..5184d2af767f2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -2243,30 +2243,38 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v14i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v37, v15 ; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 ; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v39, v11 ; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 ; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v49, v7 ; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 ; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v51, v3 ; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 ; SI-NEXT: v_mov_b32_e32 v29, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; 
SI-NEXT: v_lshlrev_b32_e32 v39, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB14_3 @@ -2292,47 +2300,47 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_or_b32_e32 v2, v2, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v50 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_or_b32_e32 v6, v6, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v39 -; SI-NEXT: v_or_b32_e32 v8, v8, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 ; 
SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true @@ -2375,10 +2383,10 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 ; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: v_or_b32_e32 v11, v21, v11 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v10, v19, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -2547,14 +2555,16 @@ define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_and_b32_e32 
v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_or_b32_e32 v8, v0, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff @@ -2574,11 +2584,9 @@ define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: v_or_b32_e32 v12, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: v_or_b32_e32 v13, v0, v23 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -3421,108 +3429,122 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v14i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v30, v14 +; SI-NEXT: v_mov_b32_e32 v31, v13 +; SI-NEXT: v_mov_b32_e32 v32, v12 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v9 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v37, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v3 +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: 
v_mov_b32_e32 v52, v0 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, 
v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v31, v9 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v10, v20, v10 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v18, v11 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: 
v_or_b32_e32 v13, v14, v13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3530,10 +3552,10 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3541,11 +3563,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3553,11 +3575,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -3565,11 +3587,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -3577,11 +3599,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3589,11 +3611,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: 
v_cvt_f32_f16_e32 v9, v30 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -3601,11 +3623,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -3613,29 +3635,27 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: 
v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3774,9 +3794,10 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 @@ -3791,11 +3812,10 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 ; SI-NEXT: 
v_cvt_f16_f32_e32 v31, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 @@ -3805,10 +3825,10 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 @@ -3818,10 +3838,10 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 ; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 @@ -3836,7 +3856,7 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3847,7 +3867,7 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, 
v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3855,11 +3875,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3867,10 +3887,10 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -3968,65 +3988,65 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; VI-NEXT: s_lshr_b32 s4, s28, 16 ; VI-NEXT: v_or_b32_e32 v13, v2, v1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s28, v0 ; VI-NEXT: s_lshr_b32 s4, s27, 16 -; VI-NEXT: v_or_b32_e32 v12, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; 
VI-NEXT: s_lshr_b32 s4, s26, 16 -; VI-NEXT: v_or_b32_e32 v11, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v2, s28, v0 ; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: v_add_f16_e32 v4, s27, v0 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: v_add_f16_e32 v6, s26, v0 +; VI-NEXT: v_or_b32_e32 v11, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: v_add_f16_e32 v3, s25, v0 +; VI-NEXT: v_or_b32_e32 v10, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: v_add_f16_e32 v5, s24, v0 +; VI-NEXT: 
v_or_b32_e32 v9, v3, v1 +; VI-NEXT: v_add_f16_sdwa v1, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: v_add_f16_e32 v6, s23, v0 +; VI-NEXT: v_or_b32_e32 v8, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: v_or_b32_e32 v7, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_e32 v14, s20, v0 +; VI-NEXT: v_or_b32_e32 v5, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, s4 ; 
VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, s19, v0 +; VI-NEXT: v_or_b32_e32 v4, v14, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v17, s18, v0 ; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: v_or_b32_e32 v3, v15, v1 +; VI-NEXT: v_add_f16_sdwa v1, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v15, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_or_b32_e32 v2, v17, v1 +; VI-NEXT: v_or_b32_e32 v1, v14, v15 ; VI-NEXT: v_mov_b32_e32 v14, s4 ; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v0, s16, v0 @@ -5787,30 +5807,38 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v14f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v37, v15 ; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 ; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v39, v11 ; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 ; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v49, v7 ; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 ; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v51, v3 ; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 ; SI-NEXT: v_mov_b32_e32 v29, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; SI-NEXT: 
v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB30_3 @@ -5836,47 +5864,47 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_or_b32_e32 v2, v2, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v50 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_or_b32_e32 v6, v6, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v39 -; SI-NEXT: v_or_b32_e32 v8, v8, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: 
v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true @@ -5919,10 +5947,10 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 ; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, 
v28, v10 -; SI-NEXT: v_or_b32_e32 v11, v21, v11 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v10, v19, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -6091,14 +6119,16 @@ define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_or_b32_e32 v8, v0, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff @@ -6118,11 +6148,9 @@ define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: v_or_b32_e32 v12, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: v_or_b32_e32 v13, v0, v23 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -6969,108 +6997,122 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v14f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v30, v14 +; SI-NEXT: v_mov_b32_e32 v31, v13 +; SI-NEXT: v_mov_b32_e32 v32, v12 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v9 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v37, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v3 +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: 
v_cvt_f16_f32_e32 v40, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; 
SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v31, v9 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v7, 
v33, v7 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v10, v20, v10 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v18, v11 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7078,10 +7120,10 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7089,11 +7131,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; 
SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7101,11 +7143,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -7113,11 +7155,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -7125,11 +7167,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: 
v_cvt_f32_f16_e32 v9, v33 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -7137,11 +7179,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -7149,11 +7191,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -7161,29 +7203,27 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 ; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7322,9 +7362,10 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 ; SI-NEXT: 
v_cvt_f16_f32_e32 v27, v3 @@ -7339,11 +7380,10 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 @@ -7353,10 +7393,10 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 @@ -7366,10 +7406,10 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 ; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 @@ -7384,7 +7424,7 
@@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7395,7 +7435,7 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7403,11 +7443,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7415,10 +7455,10 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v6 @@ -7516,65 +7556,65 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; VI-NEXT: s_lshr_b32 s4, s28, 16 ; VI-NEXT: v_or_b32_e32 v13, v2, v1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s28, v0 ; VI-NEXT: s_lshr_b32 s4, s27, 16 -; VI-NEXT: v_or_b32_e32 v12, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s26, 16 -; VI-NEXT: v_or_b32_e32 v11, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v2, s28, v0 ; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: v_add_f16_e32 v4, s27, v0 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: v_add_f16_e32 v6, s26, v0 +; VI-NEXT: v_or_b32_e32 v11, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; 
VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: v_add_f16_e32 v3, s25, v0 +; VI-NEXT: v_or_b32_e32 v10, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: v_add_f16_e32 v5, s24, v0 +; VI-NEXT: v_or_b32_e32 v9, v3, v1 +; VI-NEXT: v_add_f16_sdwa v1, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: v_add_f16_e32 v6, s23, v0 +; VI-NEXT: v_or_b32_e32 v8, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: v_or_b32_e32 v7, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: v_add_f16_e32 
v5, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_e32 v14, s20, v0 +; VI-NEXT: v_or_b32_e32 v5, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, s19, v0 +; VI-NEXT: v_or_b32_e32 v4, v14, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v17, s18, v0 ; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: v_or_b32_e32 v3, v15, v1 +; VI-NEXT: v_add_f16_sdwa v1, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v15, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_or_b32_e32 v2, v17, v1 +; VI-NEXT: v_or_b32_e32 v1, v14, v15 ; VI-NEXT: v_mov_b32_e32 v14, s4 ; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v0, s16, v0 @@ -8714,30 +8754,38 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v7i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v37, v15 ; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: 
v_mov_b32_e32 v38, v13 ; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v39, v11 ; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 ; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v49, v7 ; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 ; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v51, v3 ; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 ; SI-NEXT: v_mov_b32_e32 v29, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
; SI-NEXT: s_cbranch_execnz .LBB42_3 @@ -8763,47 +8811,47 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_or_b32_e32 v2, v2, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v50 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_or_b32_e32 v6, v6, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v39 -; SI-NEXT: v_or_b32_e32 v8, v8, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; 
implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true @@ -8846,10 +8894,10 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 ; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: v_or_b32_e32 v11, v21, v11 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v10, v19, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -9018,14 +9066,16 @@ define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_or_b32_e32 v8, v0, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff @@ -9045,11 +9095,9 @@ define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 
16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: v_or_b32_e32 v12, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: v_or_b32_e32 v13, v0, v23 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -9896,108 +9944,122 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v7i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v30, v14 +; SI-NEXT: v_mov_b32_e32 v31, v13 +; SI-NEXT: v_mov_b32_e32 v32, v12 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v9 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v37, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v3 +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: 
v_cvt_f16_f32_e32 v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; 
SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v31, v9 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 ; 
SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v10, v20, v10 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v18, v11 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10005,10 +10067,10 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10016,11 +10078,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10028,11 +10090,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -10040,11 +10102,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 
v6, v36 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -10052,11 +10114,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -10064,11 +10126,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -10076,11 +10138,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 
16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -10088,29 +10150,27 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -10249,9 +10309,10 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 @@ -10266,11 +10327,10 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 @@ -10280,10 +10340,10 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 @@ 
-10293,10 +10353,10 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 ; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 @@ -10311,7 +10371,7 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10322,7 +10382,7 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10330,11 +10390,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10342,10 +10402,10 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -10443,65 +10503,65 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s4, s28, 16 ; VI-NEXT: v_or_b32_e32 v13, v2, v1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s28, v0 ; VI-NEXT: s_lshr_b32 s4, s27, 16 -; VI-NEXT: v_or_b32_e32 v12, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s26, 16 -; VI-NEXT: v_or_b32_e32 v11, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v2, s28, v0 ; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: v_add_f16_e32 v4, s27, v0 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: v_add_f16_e32 v6, s26, v0 +; VI-NEXT: v_or_b32_e32 v11, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: v_add_f16_e32 v3, s25, v0 +; VI-NEXT: v_or_b32_e32 v10, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: v_add_f16_e32 v5, s24, v0 +; VI-NEXT: v_or_b32_e32 v9, v3, v1 +; VI-NEXT: v_add_f16_sdwa v1, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: v_add_f16_e32 v6, s23, v0 +; VI-NEXT: v_or_b32_e32 v8, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: 
v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: v_or_b32_e32 v7, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_e32 v14, s20, v0 +; VI-NEXT: v_or_b32_e32 v5, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, s19, v0 +; VI-NEXT: v_or_b32_e32 v4, v14, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v17, s18, v0 ; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: v_or_b32_e32 v3, v15, v1 +; VI-NEXT: v_add_f16_sdwa v1, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v15, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_or_b32_e32 v2, v17, v1 +; VI-NEXT: v_or_b32_e32 v1, v14, v15 ; VI-NEXT: v_mov_b32_e32 v14, s4 ; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v0, s16, v0 @@ -10871,6 +10931,7 @@ define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v5, s19 ; SI-NEXT: v_mov_b32_e32 v9, s21 ; SI-NEXT: v_mov_b32_e32 v13, s23 @@ -10883,7 +10944,6 @@ define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v12, s22 ; SI-NEXT: v_mov_b32_e32 v8, s20 ; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v7, s7 @@ -11040,30 +11100,38 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v7f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v37, v15 ; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 ; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v39, v11 ; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 ; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v49, v7 ; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 ; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v51, v3 ; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 ; SI-NEXT: v_mov_b32_e32 v29, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 -; 
SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB50_3 @@ -11089,47 +11157,47 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_or_b32_e32 v2, v2, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v50 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_or_b32_e32 v6, v6, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v39 -; SI-NEXT: v_or_b32_e32 v8, v8, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: 
v_or_b32_e32 v13, v13, v17 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: .LBB50_4: ; %cmp.true @@ -11172,10 +11240,10 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 ; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: v_or_b32_e32 v11, v21, v11 -; SI-NEXT: v_or_b32_e32 v12, 
v19, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v10, v19, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -11344,14 +11412,16 @@ define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_or_b32_e32 v8, v0, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff @@ -11371,11 +11441,9 @@ define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: v_or_b32_e32 v12, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: v_or_b32_e32 v13, v0, v23 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -11950,40 +12018,39 @@ define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 -; 
SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[9:10], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: 
v_cvt_f32_f16_e32 v14, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 @@ -11991,13 +12058,14 @@ define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -12162,108 +12230,122 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v7f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v30, v14 +; SI-NEXT: v_mov_b32_e32 v31, v13 +; SI-NEXT: v_mov_b32_e32 v32, v12 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v9 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; 
SI-NEXT: v_mov_b32_e32 v37, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v3 +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v36 +; SI-NEXT: 
v_cvt_f16_f32_e32 v38, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v31, v9 -; SI-NEXT: 
v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v10, v20, v10 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; 
implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v11, v18, v11 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12271,10 +12353,10 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -12282,11 +12364,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -12294,11 +12376,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -12306,11 +12388,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -12318,11 +12400,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ 
-12330,11 +12412,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -12342,11 +12424,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -12354,29 +12436,27 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; 
SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -12515,9 +12595,10 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 @@ -12532,11 +12613,10 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 -; SI-NEXT: 
v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 @@ -12546,10 +12626,10 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 @@ -12559,10 +12639,10 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 ; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 @@ -12577,7 +12657,7 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 @@ -12588,7 +12668,7 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -12596,11 +12676,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -12608,10 +12688,10 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -12709,65 +12789,65 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; VI-NEXT: s_lshr_b32 s4, s28, 16 ; VI-NEXT: v_or_b32_e32 v13, v2, v1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s28, v0 ; 
VI-NEXT: s_lshr_b32 s4, s27, 16 -; VI-NEXT: v_or_b32_e32 v12, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s26, 16 -; VI-NEXT: v_or_b32_e32 v11, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v2, s28, v0 ; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: v_add_f16_e32 v4, s27, v0 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: v_add_f16_e32 v6, s26, v0 +; VI-NEXT: v_or_b32_e32 v11, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: v_add_f16_e32 v3, s25, v0 +; VI-NEXT: v_or_b32_e32 v10, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, 
s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: v_add_f16_e32 v5, s24, v0 +; VI-NEXT: v_or_b32_e32 v9, v3, v1 +; VI-NEXT: v_add_f16_sdwa v1, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: v_add_f16_e32 v6, s23, v0 +; VI-NEXT: v_or_b32_e32 v8, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: v_or_b32_e32 v7, v6, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_or_b32_e32 v6, v4, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_e32 v14, s20, v0 +; VI-NEXT: v_or_b32_e32 v5, v5, v1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, s19, v0 +; VI-NEXT: v_or_b32_e32 v4, v14, v1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v17, s18, v0 ; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: v_or_b32_e32 v3, v15, v1 +; VI-NEXT: v_add_f16_sdwa v1, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v15, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_or_b32_e32 v2, v17, v1 +; VI-NEXT: v_or_b32_e32 v1, v14, v15 ; VI-NEXT: v_mov_b32_e32 v14, s4 ; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v0, s16, v0 @@ -13114,43 +13194,43 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 3 -; VI-NEXT: v_add_u16_sdwa v19, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v20, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v13, 3, v13 -; VI-NEXT: v_or_b32_e32 v13, v13, v19 -; VI-NEXT: v_add_u16_sdwa v19, v12, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v12, v14 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v12, 3, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v19 -; VI-NEXT: v_add_u16_sdwa v19, v11, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v20 +; VI-NEXT: v_add_u16_sdwa v20, v11, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v21 ; VI-NEXT: v_add_u16_e32 v11, 3, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v19 -; VI-NEXT: v_add_u16_sdwa v19, v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v10, 3, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v19 -; VI-NEXT: v_add_u16_sdwa v19, v9, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v20 +; VI-NEXT: v_add_u16_sdwa v20, v9, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v21 ; VI-NEXT: v_add_u16_e32 v9, 3, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v19 -; VI-NEXT: v_add_u16_sdwa v19, v8, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v8, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v19 -; VI-NEXT: v_add_u16_sdwa v19, v7, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v19 -; VI-NEXT: v_add_u16_sdwa v19, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_add_u16_sdwa v15, v0, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v16, v1, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: 
v_add_u16_sdwa v17, v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v18, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v19 ; VI-NEXT: v_add_u16_sdwa v19, v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v5, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v20 +; VI-NEXT: v_add_u16_sdwa v20, v7, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v21 +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_add_u16_sdwa v21, v5, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_e32 v5, v5, v14 +; VI-NEXT: v_or_b32_e32 v7, v7, v20 +; VI-NEXT: v_or_b32_e32 v6, v6, v14 +; VI-NEXT: v_or_b32_e32 v5, v5, v21 ; VI-NEXT: v_or_b32_e32 v4, v4, v19 ; VI-NEXT: v_or_b32_e32 v3, v3, v18 ; VI-NEXT: v_or_b32_e32 v2, v2, v17 @@ -13614,28 +13694,18 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, 
v27 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -13643,11 +13713,10 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 @@ -13655,21 +13724,27 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: 
v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -13677,9 +13752,9 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -13693,10 +13768,10 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: 
v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v14, v14, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -13704,10 +13779,16 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v7 ; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v25 ; SI-NEXT: v_or_b32_e32 v2, v2, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -13715,21 +13796,20 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v29, 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: 
v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -13744,43 +13824,43 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 0x200 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v13 ; VI-NEXT: v_add_f16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v19, v13 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v12 ; VI-NEXT: v_add_f16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v19, v12 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v13, v20, v13 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v12, v21, v12 ; VI-NEXT: v_add_f16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v19, v11 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v10 ; VI-NEXT: v_add_f16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v19, v10 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v11, v20, v11 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v10, v21, v10 ; VI-NEXT: v_add_f16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v19, v9 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v8 ; VI-NEXT: v_add_f16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v19, v8 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 -; VI-NEXT: v_add_f16_sdwa v7, v7, v15 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v19, v7 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 -; VI-NEXT: v_add_f16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v14, 0x200, v0 -; VI-NEXT: v_add_f16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v16, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v2, v2, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v3 -; VI-NEXT: v_add_f16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v19, v6 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v9, v20, v9 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v8, v21, v8 +; VI-NEXT: v_add_f16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v2, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v15, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v15 +; VI-NEXT: v_add_f16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_add_f16_sdwa v15, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v7, v20, v7 +; VI-NEXT: v_or_b32_e32 v6, v6, v15 +; VI-NEXT: v_or_b32_e32 v5, v21, v5 ; VI-NEXT: v_or_b32_e32 v4, v19, v4 ; VI-NEXT: v_or_b32_e32 v3, v18, v3 ; VI-NEXT: v_or_b32_e32 v2, v17, v2 @@ -13866,33 +13946,27 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v17, v3 ; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v20, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 
v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 @@ -13901,6 +13975,21 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -13909,28 +13998,18 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: 
v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -13938,11 +14017,10 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 @@ -13950,21 +14028,27 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -13972,9 +14056,9 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -13988,10 +14072,10 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v14, v14, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -13999,10 +14083,16 @@ define 
inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v7 ; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v25 ; SI-NEXT: v_or_b32_e32 v2, v2, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -14010,21 +14100,20 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v29, 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: @@ -14039,77 +14128,77 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_4 ; VI-NEXT: .LBB59_2: ; %cmp.true -; 
VI-NEXT: s_lshr_b32 s5, s28, 16 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s29, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s29, 16 -; VI-NEXT: v_add_f16_e32 v1, s28, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s27, 16 -; VI-NEXT: v_or_b32_e32 v12, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s27, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s26, 16 -; VI-NEXT: v_or_b32_e32 v11, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s26, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s25, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s24, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 
16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_add_f16_e32 v3, s29, v0 ; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: s_lshr_b32 s4, s27, 16 ; VI-NEXT: v_or_b32_e32 v13, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 ; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v12, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v11, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v10, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v9, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v8, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v6, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v5, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v4, v4, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, s4 +; VI-NEXT: v_add_f16_e32 v3, s19, v0 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v15, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_or_b32_e32 v3, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, s4 +; VI-NEXT: v_add_f16_e32 v17, s18, v0 ; 
VI-NEXT: v_add_f16_e32 v14, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_or_b32_e32 v0, v14, v15 +; VI-NEXT: v_add_f16_e32 v15, s17, v0 +; VI-NEXT: v_or_b32_e32 v2, v17, v2 +; VI-NEXT: v_add_f16_sdwa v17, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v15, v0 +; VI-NEXT: v_or_b32_e32 v0, v14, v17 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB59_3: ; VI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 35d135b123969..69b3922187011 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2394,8 +2394,8 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -2419,9 +2419,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2444,50 +2444,50 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v55 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v9, v9, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_or_b32_e32 v10, v10, 
v48 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -2528,8 +2528,8 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_or_b32_e32 v3, v55, v3 ; SI-NEXT: v_or_b32_e32 v4, v54, v4 @@ -2731,9 +2731,11 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_or_b32_e32 v8, v0, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff @@ -2763,11 +2765,9 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: 
v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_or_b32_e32 v15, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -3085,39 +3085,33 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v16i32_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v15 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v34, v13 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v36, v11 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v38, v9 -; SI-NEXT: v_mov_b32_e32 v39, v8 -; SI-NEXT: v_mov_b32_e32 v48, v7 -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: v_mov_b32_e32 v50, v5 -; SI-NEXT: v_mov_b32_e32 v51, v4 -; SI-NEXT: v_mov_b32_e32 v52, v3 -; SI-NEXT: v_mov_b32_e32 v53, v2 -; SI-NEXT: v_mov_b32_e32 v54, v1 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v34, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: 
$vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -3136,128 +3130,124 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; 
SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: v_cvt_f32_f16_e32 
v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, 
v51 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, 
vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 @@ -3267,15 +3257,27 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 
v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v49 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: v_mov_b32_e32 v5, v53 +; SI-NEXT: v_mov_b32_e32 v7, v52 +; SI-NEXT: v_mov_b32_e32 v9, v51 +; SI-NEXT: v_mov_b32_e32 v11, v50 +; SI-NEXT: v_mov_b32_e32 v13, v48 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32f16: @@ -3697,19 +3699,17 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 @@ -3733,9 +3733,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 @@ -3745,7 +3743,9 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 
v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 @@ -3767,50 +3767,50 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v31, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: ; 
implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 @@ -3819,41 +3819,41 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 @@ -3948,15 +3948,14 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4103,8 +4102,9 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 @@ -4124,12 +4124,11 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s20 +; 
SI-NEXT: v_cvt_f16_f32_e32 v53, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 @@ -4140,10 +4139,10 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 @@ -4156,10 +4155,10 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 ; SI-NEXT: v_or_b32_e32 v4, v39, v4 ; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: v_or_b32_e32 v6, v35, v6 @@ -4174,10 +4173,10 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -4186,21 +4185,21 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 @@ -4502,23 +4501,24 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v16i32_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v39, v7 -; SI-NEXT: v_mov_b32_e32 v38, v6 -; SI-NEXT: v_mov_b32_e32 v37, v5 -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v35, v3 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -4561,41 +4561,35 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -4609,25 +4603,31 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; 
SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 @@ -5082,23 +5082,21 @@ define <16 x i32> 
@bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 
1.0, v3 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 @@ -5117,26 +5115,26 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 @@ -5149,79 +5147,79 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: 
v_alignbit_b32 v11, v11, v32, 16 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v15, v15, v17, 16 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB22_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, 
v1, 16 +; SI-NEXT: v_alignbit_b32 v2, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -5286,7 +5284,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -5294,22 +5292,21 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; SI-NEXT: .LBB22_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6467,9 +6464,10 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s19 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 @@ -6488,27 +6486,26 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 @@ -6518,13 +6515,13 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16 ; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v52, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v50, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, 
v48, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v38, 16 ; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 ; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 ; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 @@ -6536,44 +6533,44 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v51 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v49 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: 
v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v37 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -6642,11 +6639,11 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -6670,279 +6667,279 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 
0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_bfe_u32 v4, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v14, v1, v2, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 
0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 
v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: 
v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: 
v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; 
VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v16, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v16 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v16 ; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v16, v3 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v17, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v17 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v17 ; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v16 +; VI-NEXT: v_bfe_u32 v2, v18, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v18 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v18 ; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 
v2, v2, v16, vcc +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v17 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v17 +; VI-NEXT: v_bfe_u32 v2, v19, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v19 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v19 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v16 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; VI-NEXT: v_bfe_u32 v18, v19, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v18, v19 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v19, v18 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_bfe_u32 v18, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc ; 
VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; VI-NEXT: s_branch .LBB23_5 ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -6964,10 +6961,10 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB23_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6976,11 +6973,11 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v21, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v21, s31, 1 ; GFX9-NEXT: v_readfirstlane_b32 s30, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s31, v1 @@ -7004,296 +7001,296 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; 
GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v14, v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 
16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 
0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 
v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, 
v17 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v2 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v2, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc ; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v18, v17 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
GFX9-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v18 ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v19, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 
v18, 16, v18 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v0, v18, 16, v0 ; GFX9-NEXT: s_branch .LBB23_5 ; GFX9-NEXT: .LBB23_3: ; GFX9-NEXT: s_branch .LBB23_2 @@ -7315,10 +7312,10 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, s30 ; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: .LBB23_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: v_readlane_b32 s31, v21, 1 +; GFX9-NEXT: v_readlane_b32 s30, v21, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9380,13 +9377,10 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v9, s26 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s27, v9, 8 -; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v14, s22 ; SI-NEXT: v_mov_b32_e32 v18, s20 ; SI-NEXT: v_mov_b32_e32 v21, s18 @@ -9397,11 +9391,14 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 ; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 ; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 -; SI-NEXT: v_alignbit_b32 v13, s25, v9, 24 -; SI-NEXT: v_alignbit_b32 v15, s25, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s25, v9, 8 +; SI-NEXT: v_alignbit_b32 v9, 
s27, v7, 24 +; SI-NEXT: v_alignbit_b32 v10, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v12, s27, v7, 8 +; SI-NEXT: v_alignbit_b32 v15, s25, v8, 24 +; SI-NEXT: v_alignbit_b32 v7, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 8 ; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 -; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v14, 16 ; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 ; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 ; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 @@ -9438,29 +9435,26 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s63, s17, 8 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_mov_b32_e32 v9, s26 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s27, v9, 8 -; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v14, s22 ; SI-NEXT: v_mov_b32_e32 v18, s20 ; SI-NEXT: v_mov_b32_e32 v21, s18 @@ -9471,11 +9465,14 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 ; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 ; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 -; SI-NEXT: v_alignbit_b32 v13, s25, v9, 24 
-; SI-NEXT: v_alignbit_b32 v15, s25, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s25, v9, 8 +; SI-NEXT: v_alignbit_b32 v9, s27, v7, 24 +; SI-NEXT: v_alignbit_b32 v10, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v12, s27, v7, 8 +; SI-NEXT: v_alignbit_b32 v15, s25, v8, 24 +; SI-NEXT: v_alignbit_b32 v7, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 8 ; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 -; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v14, 16 ; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 ; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 ; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 @@ -9518,18 +9515,16 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s5, s63, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s62, 0xff -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s61, 24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_mov_b32_e32 v24, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; SI-NEXT: v_or_b32_e32 v21, s4, v21 @@ -9542,17 +9537,18 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s58, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: 
buffer_store_dword v22, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v20, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff @@ -9584,70 +9580,70 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v14, s4, v14 ; SI-NEXT: s_and_b32 s4, s23, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s44, 24 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 ; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; 
SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 ; SI-NEXT: s_and_b32 s4, s25, 0xff ; SI-NEXT: s_lshl_b32 s5, s43, 8 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; SI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v15 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s41, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v10 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v12 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 ; SI-NEXT: s_and_b32 s4, s27, 0xff ; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s15, 0xff 
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s14, s5 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen @@ -9709,28 +9705,29 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr9 ; 
SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 @@ -9744,10 +9741,9 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -17171,8 +17167,8 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -17196,9 +17192,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17221,50 +17217,50 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 ; SI-NEXT: 
v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v55 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v9, v9, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 ; SI-NEXT: ; implicit-def: $vgpr49 ; 
SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -17305,8 +17301,8 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_or_b32_e32 v3, v55, v3 ; SI-NEXT: v_or_b32_e32 v4, v54, v4 @@ -17508,9 +17504,11 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_or_b32_e32 v8, v0, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff @@ -17540,11 +17538,9 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_or_b32_e32 v15, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -17862,39 +17858,33 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v16f32_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v15 
-; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v34, v13 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v36, v11 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v38, v9 -; SI-NEXT: v_mov_b32_e32 v39, v8 -; SI-NEXT: v_mov_b32_e32 v48, v7 -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: v_mov_b32_e32 v50, v5 -; SI-NEXT: v_mov_b32_e32 v51, v4 -; SI-NEXT: v_mov_b32_e32 v52, v3 -; SI-NEXT: v_mov_b32_e32 v53, v2 -; SI-NEXT: v_mov_b32_e32 v54, v1 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v34, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -17913,128 +17903,124 @@ define <32 x half> 
@bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_4 -; SI-NEXT: .LBB40_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB40_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: 
v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; 
implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 -; SI-NEXT: .LBB40_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v35 -; 
SI-NEXT: v_add_f32_e32 v26, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: 
v_add_f32_e32 v14, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 @@ -18044,15 +18030,27 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: 
v_cvt_f32_f16_e32 v48, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v49 +; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: v_mov_b32_e32 v5, v53 +; SI-NEXT: v_mov_b32_e32 v7, v52 +; SI-NEXT: v_mov_b32_e32 v9, v51 +; SI-NEXT: v_mov_b32_e32 v11, v50 +; SI-NEXT: v_mov_b32_e32 v13, v48 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32f16: @@ -18470,19 +18468,17 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], 
s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 @@ -18506,9 +18502,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 @@ -18518,7 +18512,9 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 @@ -18540,50 +18536,50 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; 
SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v31, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 
+; SI-NEXT: v_or_b32_e32 v14, v18, v14 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 @@ -18592,41 +18588,41 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; 
SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 @@ -18721,15 +18717,14 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -18876,8 +18871,9 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 @@ -18897,12 +18893,11 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 @@ -18913,10 +18908,10 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; 
%bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 @@ -18929,10 +18924,10 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 ; SI-NEXT: v_or_b32_e32 v4, v39, v4 ; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: v_or_b32_e32 v6, v35, v6 @@ -18947,10 +18942,10 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -18959,21 +18954,21 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 @@ -19275,23 +19270,24 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v16f32_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v39, v7 -; SI-NEXT: v_mov_b32_e32 v38, v6 -; SI-NEXT: v_mov_b32_e32 v37, v5 -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v35, v3 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: 
v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -19334,41 +19330,35 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 
16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -19382,25 +19372,31 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v32 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 @@ -19833,23 +19829,21 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 
offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 @@ -19868,26 +19862,26 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: 
v_mul_f32_e32 v31, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 @@ -19900,79 +19894,79 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; 
SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; 
implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v15, v15, v17, 16 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -20037,7 +20031,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; 
SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -20045,22 +20039,21 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -21218,9 +21211,10 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s19 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 @@ -21239,27 +21233,26 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s29 +; SI-NEXT: 
v_mul_f32_e64 v38, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 @@ -21269,13 +21262,13 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16 ; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v52, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v50, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v48, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v38, 16 ; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 ; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 ; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 @@ -21287,44 +21280,44 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v51 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v49 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v37 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -21393,11 +21386,11 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -21421,279 +21414,279 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: 
s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_bfe_u32 v4, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v14, v1, v2, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: 
v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; 
VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, 
v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; 
VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v16, 16, 1 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v16 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v16 ; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v16, v3 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v17, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v17 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v17 ; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v16 +; VI-NEXT: v_bfe_u32 v2, v18, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v18 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v18 ; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v17 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 
0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v17 +; VI-NEXT: v_bfe_u32 v2, v19, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v19 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v19 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v16 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; VI-NEXT: v_bfe_u32 v18, v19, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, 
vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v18, v19 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v19, v18 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_bfe_u32 v18, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; VI-NEXT: s_branch .LBB47_5 ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -21715,10 +21708,10 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB47_5: ; %end -; 
VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -21727,11 +21720,11 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v21, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v21, s31, 1 ; GFX9-NEXT: v_readfirstlane_b32 s30, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s31, v1 @@ -21755,296 +21748,296 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; 
GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v14, v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; 
GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
+; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; 
GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v3 +; 
GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: 
v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 
; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v2 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v2, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc ; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v18, v17 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v18 ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc +; GFX9-NEXT: 
v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v19, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v0, v18, 16, v0 ; GFX9-NEXT: s_branch .LBB47_5 ; GFX9-NEXT: .LBB47_3: ; GFX9-NEXT: s_branch .LBB47_2 @@ -22066,10 +22059,10 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> 
inreg ; GFX9-NEXT: v_mov_b32_e32 v14, s30 ; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: .LBB47_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: v_readlane_b32 s31, v21, 1 +; GFX9-NEXT: v_readlane_b32 s30, v21, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -24111,12 +24104,12 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_mov_b32_e32 v28, s16 -; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v24, s17 ; SI-NEXT: v_mov_b32_e32 v20, s18 ; SI-NEXT: v_mov_b32_e32 v19, s19 ; SI-NEXT: v_mov_b32_e32 v15, s20 ; SI-NEXT: v_mov_b32_e32 v14, s21 -; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v10, s22 ; SI-NEXT: v_mov_b32_e32 v9, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v7, s25 @@ -24143,8 +24136,8 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v10, v2, v1, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v11, v2, v1, 24 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v13, v2, v1, 8 ; SI-NEXT: v_alignbit_b32 v16, v3, v4, 24 @@ -24156,20 +24149,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v29, v7, v8, 24 ; SI-NEXT: v_alignbit_b32 v30, v7, v8, 16 ; 
SI-NEXT: v_alignbit_b32 v31, v7, v8, 8 -; SI-NEXT: v_alignbit_b32 v35, v9, v11, 24 -; SI-NEXT: v_alignbit_b32 v36, v9, v11, 16 -; SI-NEXT: v_alignbit_b32 v37, v9, v11, 8 +; SI-NEXT: v_alignbit_b32 v35, v9, v10, 24 +; SI-NEXT: v_alignbit_b32 v36, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v10, 8 ; SI-NEXT: v_alignbit_b32 v49, v14, v15, 24 ; SI-NEXT: v_alignbit_b32 v50, v14, v15, 16 ; SI-NEXT: v_alignbit_b32 v52, v14, v15, 8 ; SI-NEXT: v_alignbit_b32 v55, v19, v20, 24 -; SI-NEXT: v_alignbit_b32 v41, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v40, v19, v20, 16 ; SI-NEXT: v_alignbit_b32 v43, v19, v20, 8 -; SI-NEXT: v_alignbit_b32 v46, v25, v28, 24 -; SI-NEXT: v_alignbit_b32 v56, v25, v28, 16 +; SI-NEXT: v_alignbit_b32 v46, v24, v28, 24 +; SI-NEXT: v_alignbit_b32 v56, v24, v28, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v25, v28, 8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; SI-NEXT: v_alignbit_b32 v58, v24, v28, 8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v2 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v3 @@ -24181,7 +24174,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v7 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v9 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 ; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v14 @@ -24194,31 +24187,31 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v19 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v24 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v24 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_alignbit_b32 v10, v2, v1, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v11, v2, v1, 24 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v13, v2, v1, 8 ; SI-NEXT: v_alignbit_b32 v16, v3, v4, 24 @@ -24230,20 +24223,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v29, v7, v8, 24 ; SI-NEXT: v_alignbit_b32 v30, v7, v8, 16 ; SI-NEXT: v_alignbit_b32 v31, v7, v8, 8 -; SI-NEXT: v_alignbit_b32 v35, v9, v11, 24 -; SI-NEXT: v_alignbit_b32 v36, v9, v11, 16 -; SI-NEXT: v_alignbit_b32 v37, v9, v11, 8 +; SI-NEXT: v_alignbit_b32 v35, v9, v10, 24 +; SI-NEXT: v_alignbit_b32 v36, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v10, 8 ; SI-NEXT: v_alignbit_b32 v49, v14, v15, 24 ; SI-NEXT: v_alignbit_b32 v50, v14, v15, 16 ; SI-NEXT: v_alignbit_b32 v52, v14, v15, 8 ; SI-NEXT: v_alignbit_b32 v55, v19, v20, 24 -; SI-NEXT: v_alignbit_b32 v41, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v40, v19, v20, 16 ; SI-NEXT: 
v_alignbit_b32 v43, v19, v20, 8 -; SI-NEXT: v_alignbit_b32 v46, v25, v28, 24 -; SI-NEXT: v_alignbit_b32 v56, v25, v28, 16 +; SI-NEXT: v_alignbit_b32 v46, v24, v28, 24 +; SI-NEXT: v_alignbit_b32 v56, v24, v28, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v25, v28, 8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; SI-NEXT: v_alignbit_b32 v58, v24, v28, 8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v2 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v3 @@ -24255,7 +24248,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v7 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v9 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 ; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v14 @@ -24268,11 +24261,11 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v19 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v24 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v24 ; SI-NEXT: .LBB49_3: ; %end ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: s_waitcnt expcnt(5) @@ -24283,70 +24276,70 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 ; SI-NEXT: v_or_b32_e32 v46, v46, v56 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; 
SI-NEXT: v_or_b32_e32 v28, v28, v46 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v63 +; SI-NEXT: v_or_b32_e32 v11, v24, v11 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v63 ; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v62 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v25, v28, v25 -; SI-NEXT: v_or_b32_e32 v10, v10, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v24, v28, v24 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v11, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v43 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v20, v25, v20 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v55 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v11, v11, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v61 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, 
v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v59 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v52 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v49 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v15, v19, v15 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v57 -; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v45 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v37 ; SI-NEXT: 
v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v36 @@ -24363,7 +24356,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v40 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v41 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 @@ -24461,7 +24454,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v25 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -24486,15 +24479,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -24510,7 +24503,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; 
implicit-def: $vgpr29 @@ -24531,10 +24524,10 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; kill: killed $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v16f32_to_v64i8_scalar: @@ -24640,7 +24633,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 ; VI-NEXT: v_add_f32_e64 v8, s25, 1.0 ; VI-NEXT: v_add_f32_e64 v7, s24, 1.0 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] ; VI-NEXT: v_add_f32_e64 v10, s23, 1.0 ; VI-NEXT: v_add_f32_e64 v9, s22, 1.0 @@ -24649,21 +24642,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_add_f32_e64 v11, s20, 1.0 ; VI-NEXT: v_add_f32_e64 v4, s29, 1.0 ; VI-NEXT: v_add_f32_e64 v3, s28, 1.0 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] -; VI-NEXT: v_add_f32_e64 v16, s19, 1.0 -; VI-NEXT: v_add_f32_e64 v15, s18, 1.0 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; VI-NEXT: v_add_f32_e64 v14, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v13, s18, 1.0 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] ; VI-NEXT: v_add_f32_e64 v18, s17, 1.0 ; VI-NEXT: v_add_f32_e64 v17, s16, 1.0 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] -; 
VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[17:18] ; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 @@ -24692,14 +24685,14 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v18 ; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v17 ; VI-NEXT: s_branch .LBB49_5 @@ -24759,28 +24752,28 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v20, s42 ; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 
-; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v13, s18 ; VI-NEXT: v_mov_b32_e32 v11, s20 -; VI-NEXT: v_mov_b32_e32 v12, s21 ; VI-NEXT: v_mov_b32_e32 v9, s22 -; VI-NEXT: v_mov_b32_e32 v10, s23 ; VI-NEXT: v_mov_b32_e32 v7, s24 -; VI-NEXT: v_mov_b32_e32 v8, s25 ; VI-NEXT: v_mov_b32_e32 v5, s26 -; VI-NEXT: v_mov_b32_e32 v6, s27 ; VI-NEXT: v_mov_b32_e32 v3, s28 -; VI-NEXT: v_mov_b32_e32 v4, s29 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: v_mov_b32_e32 v62, s65 -; VI-NEXT: v_mov_b32_e32 v13, s66 -; VI-NEXT: v_mov_b32_e32 v60, s64 -; VI-NEXT: v_mov_b32_e32 v61, s55 -; VI-NEXT: v_mov_b32_e32 v58, s54 -; VI-NEXT: v_mov_b32_e32 v59, s52 +; VI-NEXT: v_mov_b32_e32 v15, s66 +; VI-NEXT: v_mov_b32_e32 v58, s64 +; VI-NEXT: v_mov_b32_e32 v60, s55 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v8, s25 +; VI-NEXT: v_mov_b32_e32 v6, s27 +; VI-NEXT: v_mov_b32_e32 v4, s29 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v59, s54 +; VI-NEXT: v_mov_b32_e32 v61, s52 ; VI-NEXT: v_mov_b32_e32 v57, s53 ; VI-NEXT: v_mov_b32_e32 v47, s51 ; VI-NEXT: v_mov_b32_e32 v56, s50 @@ -24812,7 +24805,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v29, s60 ; VI-NEXT: v_mov_b32_e32 v28, s58 ; VI-NEXT: v_mov_b32_e32 v27, s59 -; VI-NEXT: v_mov_b32_e32 v14, s57 +; VI-NEXT: v_mov_b32_e32 v16, s57 ; VI-NEXT: v_mov_b32_e32 v26, s56 ; VI-NEXT: v_mov_b32_e32 v22, s12 ; VI-NEXT: v_mov_b32_e32 v23, s10 @@ -24823,33 +24816,33 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v20, s40 ; VI-NEXT: v_mov_b32_e32 v21, s14 ; VI-NEXT: .LBB49_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b32_e32 
v19, 8, v19 -; VI-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v25 ; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v62, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v61 -; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v60 +; VI-NEXT: v_or_b32_sdwa v17, v58, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v17, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v24 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v58 -; VI-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v59 +; VI-NEXT: v_or_b32_sdwa v15, v61, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v15, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v57 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v56 -; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; VI-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v14, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v46 ; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 @@ -24956,7 +24949,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -25080,32 +25073,32 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 ; GFX9-NEXT: v_add_f32_e64 v8, s25, 1.0 ; GFX9-NEXT: v_add_f32_e64 v7, s24, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] ; GFX9-NEXT: v_add_f32_e64 v10, s23, 1.0 ; GFX9-NEXT: v_add_f32_e64 v9, s22, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX9-NEXT: v_add_f32_e64 v12, s21, 1.0 ; GFX9-NEXT: v_add_f32_e64 v11, s20, 1.0 ; GFX9-NEXT: v_add_f32_e64 v4, s29, 1.0 ; GFX9-NEXT: v_add_f32_e64 v3, s28, 1.0 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_add_f32_e64 v16, s19, 1.0 -; GFX9-NEXT: v_add_f32_e64 v15, s18, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_add_f32_e64 v20, s17, 1.0 -; GFX9-NEXT: v_add_f32_e64 v19, s16, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e64 v14, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v13, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] +; GFX9-NEXT: v_add_f32_e64 v19, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v18, s16, 1.0 +; 
GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 @@ -25134,16 +25127,16 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v18 ; GFX9-NEXT: s_branch .LBB49_5 ; GFX9-NEXT: .LBB49_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 @@ -25196,34 +25189,34 @@ define inreg <64 x 
i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB49_2 ; GFX9-NEXT: .LBB49_4: -; GFX9-NEXT: v_mov_b32_e32 v21, s44 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s44 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s42 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 ; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 ; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 ; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v8, s25 ; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 ; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v16, s55 +; GFX9-NEXT: v_mov_b32_e32 v61, s53 +; GFX9-NEXT: v_mov_b32_e32 v15, s54 +; GFX9-NEXT: v_mov_b32_e32 v59, s52 +; GFX9-NEXT: v_mov_b32_e32 v60, s51 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s19 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v17, s55 -; GFX9-NEXT: v_mov_b32_e32 v62, s53 -; GFX9-NEXT: v_mov_b32_e32 v13, s54 -; GFX9-NEXT: v_mov_b32_e32 v60, s52 -; GFX9-NEXT: v_mov_b32_e32 v61, s51 ; GFX9-NEXT: v_mov_b32_e32 v58, s50 -; GFX9-NEXT: 
v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v62, s48 ; GFX9-NEXT: v_mov_b32_e32 v57, s49 ; GFX9-NEXT: v_mov_b32_e32 v47, s39 ; GFX9-NEXT: v_mov_b32_e32 v56, s38 @@ -25255,45 +25248,45 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v29, s60 ; GFX9-NEXT: v_mov_b32_e32 v28, s58 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 -; GFX9-NEXT: v_mov_b32_e32 v14, s57 -; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s56 +; GFX9-NEXT: v_mov_b32_e32 v22, s12 +; GFX9-NEXT: v_mov_b32_e32 v23, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s8 +; GFX9-NEXT: v_mov_b32_e32 v25, s6 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v21, s14 ; GFX9-NEXT: .LBB49_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 -; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v18, v61, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v59, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v15, v62, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v23 ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 @@ -25305,7 +25298,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v22 ; 
GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 @@ -25317,7 +25310,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 @@ -25329,7 +25322,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 @@ -25381,8 +25374,8 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, 
v18 -; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload @@ -31396,8 +31389,8 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -31421,9 +31414,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -31446,50 +31439,50 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v40 -; SI-NEXT: v_or_b32_e32 
v3, v3, v55 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v9, v9, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; 
implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -31530,8 +31523,8 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_or_b32_e32 v3, v55, v3 ; SI-NEXT: v_or_b32_e32 v4, v54, v4 @@ -31733,9 +31726,11 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_or_b32_e32 v8, v0, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff @@ -31765,11 +31760,9 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_or_b32_e32 v15, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -32087,39 +32080,33 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v8i64_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v15 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v35, v13 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: 
v_mov_b32_e32 v36, v14 ; SI-NEXT: v_mov_b32_e32 v34, v12 -; SI-NEXT: v_mov_b32_e32 v37, v11 -; SI-NEXT: v_mov_b32_e32 v36, v10 -; SI-NEXT: v_mov_b32_e32 v39, v9 -; SI-NEXT: v_mov_b32_e32 v38, v8 -; SI-NEXT: v_mov_b32_e32 v49, v7 -; SI-NEXT: v_mov_b32_e32 v48, v6 -; SI-NEXT: v_mov_b32_e32 v51, v5 -; SI-NEXT: v_mov_b32_e32 v50, v4 -; SI-NEXT: v_mov_b32_e32 v53, v3 -; SI-NEXT: v_mov_b32_e32 v52, v2 -; SI-NEXT: v_mov_b32_e32 v55, v1 -; SI-NEXT: v_mov_b32_e32 v54, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -32138,126 +32125,122 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
SI-NEXT: s_cbranch_execnz .LBB60_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_4 -; SI-NEXT: .LBB60_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB60_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v28, 
16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr5 ; 
SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: .LBB60_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 -; SI-NEXT: .LBB60_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v53, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v50 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v51, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v48 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v49, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v38 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v39, vcc -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v36 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v37, vcc -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v34 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v35, vcc -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, 
v33, vcc -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v39 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v37 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v33 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: 
v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -32269,15 +32252,27 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: 
v_cvt_f32_f16_e32 v53, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v49 +; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: v_mov_b32_e32 v5, v53 +; SI-NEXT: v_mov_b32_e32 v7, v52 +; SI-NEXT: v_mov_b32_e32 v9, v51 +; SI-NEXT: v_mov_b32_e32 v11, v50 +; SI-NEXT: v_mov_b32_e32 v13, v48 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32f16: @@ -32703,19 +32698,17 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 @@ -32739,9 +32732,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 @@ -32751,7 +32742,9 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 @@ -32773,50 +32766,50 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, 
v51, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v31, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: ; implicit-def: 
$vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 @@ -32825,41 +32818,41 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: 
v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 @@ -32954,15 +32947,14 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: .LBB62_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33109,8 +33101,9 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 @@ -33130,12 +33123,11 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 @@ -33146,10 +33138,10 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 
v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 @@ -33162,10 +33154,10 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 ; SI-NEXT: v_or_b32_e32 v4, v39, v4 ; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: v_or_b32_e32 v6, v35, v6 @@ -33180,10 +33172,10 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -33192,21 +33184,21 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; 
SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 @@ -33508,23 +33500,24 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v8i64_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v39, v7 -; SI-NEXT: v_mov_b32_e32 v38, v6 -; SI-NEXT: v_mov_b32_e32 v37, v5 -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v35, v3 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; 
SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -33567,41 +33560,35 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v29, 
0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -33615,25 +33602,31 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc -; SI-NEXT: v_add_i32_e32 
v2, vcc, 3, v34 -; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v35, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v37, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v39, vcc -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v49, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v51, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v53, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v55, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v36, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v37 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v38, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v39 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v48, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v50, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v52, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v53 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v54, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v55 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v34, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v33 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v32, vcc ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 @@ -34092,23 +34085,21 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 @@ -34127,26 +34118,26 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 
1.0, v23 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 @@ -34159,79 +34150,79 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: 
v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: 
$vgpr18 +; SI-NEXT: v_alignbit_b32 v15, v15, v17, 16 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB66_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -34296,7 +34287,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -34304,22 +34295,21 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35477,9 +35467,10 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s19 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 @@ -35498,27 +35489,26 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 
1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 @@ -35528,13 +35518,13 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16 ; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v52, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v50, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v48, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v38, 16 ; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 ; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 ; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 @@ -35546,44 +35536,44 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 
0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v51 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v49 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v37 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -35652,11 +35642,11 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -35680,279 +35670,279 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: 
v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_bfe_u32 v4, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v14, v1, v2, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; 
VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, 
s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, 
vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v16, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v16 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v16 ; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v16, v3 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v17, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v17 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v17 ; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v16 +; VI-NEXT: v_bfe_u32 v2, v18, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v18 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v18 ; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v17 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v16, 
0x400000, v17 +; VI-NEXT: v_bfe_u32 v2, v19, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v19 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v19 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v16 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; VI-NEXT: v_bfe_u32 v18, v19, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, 
v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v18, v19 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v19, v18 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_bfe_u32 v18, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; VI-NEXT: s_branch .LBB67_5 ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -35974,10 +35964,10 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB67_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; 
VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -35986,11 +35976,11 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v21, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v21, s31, 1 ; GFX9-NEXT: v_readfirstlane_b32 s30, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s31, v1 @@ -36014,296 +36004,296 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; 
GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v14, v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; 
GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa 
v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; 
GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; 
GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; 
GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: 
v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v2 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v2, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc ; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v18, v17 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v18 ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; 
GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v19, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v0, v18, 16, v0 ; GFX9-NEXT: s_branch .LBB67_5 ; GFX9-NEXT: .LBB67_3: ; GFX9-NEXT: s_branch .LBB67_2 @@ -36325,10 +36315,10 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 
v14, s30 ; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: .LBB67_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: v_readlane_b32 s31, v21, 1 +; GFX9-NEXT: v_readlane_b32 s30, v21, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -38402,8 +38392,8 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v14, s22 ; SI-NEXT: v_mov_b32_e32 v18, s20 ; SI-NEXT: v_mov_b32_e32 v21, s18 @@ -38414,14 +38404,14 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 ; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 ; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s27, v9, 8 -; SI-NEXT: v_alignbit_b32 v13, s25, v10, 24 -; SI-NEXT: v_alignbit_b32 v15, s25, v10, 16 -; SI-NEXT: v_alignbit_b32 v10, s25, v10, 8 -; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 -; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v7, 24 +; SI-NEXT: v_alignbit_b32 v10, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v7, 8 +; SI-NEXT: v_alignbit_b32 v15, s25, v8, 24 +; SI-NEXT: v_alignbit_b32 v7, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 8 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v13, s23, v14, 16 ; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 ; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 
; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 @@ -38476,8 +38466,8 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: s_addc_u32 s6, s6, 0 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v14, s22 ; SI-NEXT: v_mov_b32_e32 v18, s20 ; SI-NEXT: v_mov_b32_e32 v21, s18 @@ -38488,14 +38478,14 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 ; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 ; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s27, v9, 8 -; SI-NEXT: v_alignbit_b32 v13, s25, v10, 24 -; SI-NEXT: v_alignbit_b32 v15, s25, v10, 16 -; SI-NEXT: v_alignbit_b32 v10, s25, v10, 8 -; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 -; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v7, 24 +; SI-NEXT: v_alignbit_b32 v10, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v7, 8 +; SI-NEXT: v_alignbit_b32 v15, s25, v8, 24 +; SI-NEXT: v_alignbit_b32 v7, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 8 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v13, s23, v14, 16 ; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 ; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 ; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 @@ -38538,18 +38528,16 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: s_lshl_b32 s5, s63, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s62, 0xff -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s61, 24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; 
SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_mov_b32_e32 v24, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; SI-NEXT: v_or_b32_e32 v21, s4, v21 @@ -38562,17 +38550,18 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s58, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v20, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff @@ -38604,72 +38593,72 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: v_or_b32_e32 v14, s4, v14 ; SI-NEXT: s_and_b32 s4, s23, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; 
SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s44, 24 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 ; SI-NEXT: s_and_b32 s4, s25, 0xff ; SI-NEXT: s_lshl_b32 s5, s43, 8 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v15 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s16, s41, 24 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 
s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v11 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 ; SI-NEXT: s_and_b32 s4, s27, 0xff ; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s15, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s14, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s28, 0xff @@ -38729,28 +38718,29 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: 
$sgpr63 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 @@ -38764,10 +38754,9 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -44683,13 +44672,13 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 -; 
SI-NEXT: v_alignbit_b32 v48, v25, v24, 16 -; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v38, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v37, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v36, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 -; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v39, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v38, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v37, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v36, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v34, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 @@ -44709,13 +44698,13 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i ; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 -; SI-NEXT: v_alignbit_b32 v48, v25, v24, 16 -; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v38, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v37, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v36, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 -; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v39, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v38, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v37, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v36, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v34, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 @@ -44731,31 +44720,31 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v14, v13 ; SI-NEXT: v_mov_b32_e32 v18, v17 ; SI-NEXT: v_mov_b32_e32 v22, v21 -; SI-NEXT: v_mov_b32_e32 v26, v25 ; SI-NEXT: v_mov_b32_e32 v28, 
v32 +; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: v_mov_b32_e32 v1, v30 +; SI-NEXT: v_mov_b32_e32 v5, v34 +; SI-NEXT: v_mov_b32_e32 v9, v35 +; SI-NEXT: v_mov_b32_e32 v13, v36 +; SI-NEXT: v_mov_b32_e32 v17, v37 +; SI-NEXT: v_mov_b32_e32 v21, v38 +; SI-NEXT: v_mov_b32_e32 v25, v39 ; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v1, v34 -; SI-NEXT: v_mov_b32_e32 v5, v35 -; SI-NEXT: v_mov_b32_e32 v9, v36 -; SI-NEXT: v_mov_b32_e32 v13, v37 -; SI-NEXT: v_mov_b32_e32 v17, v38 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v25, v48 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB73_4: -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -44900,8 +44889,8 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ 
-44925,9 +44914,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -44950,50 +44939,50 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v55 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v9, v9, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v6, v6, 
v52 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -45034,8 +45023,8 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_or_b32_e32 v3, v55, v3 ; SI-NEXT: v_or_b32_e32 v4, v54, v4 @@ -45237,9 +45226,11 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB75_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_or_b32_e32 v8, v0, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; 
SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff @@ -45269,11 +45260,9 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_or_b32_e32 v15, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -45591,23 +45580,27 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v8f64_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: 
; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -45628,61 +45621,64 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: 
v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v52 +; SI-NEXT: v_cvt_f32_f16_e32 
v50, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v42 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: .LBB76_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -45697,13 +45693,13 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 @@ -45720,14 +45716,14 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; 
SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -45736,32 +45732,37 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v32 ; SI-NEXT: v_mov_b32_e32 v1, v55 -; SI-NEXT: v_mov_b32_e32 v2, v53 -; SI-NEXT: v_mov_b32_e32 v3, v52 -; SI-NEXT: v_mov_b32_e32 v4, v51 -; SI-NEXT: v_mov_b32_e32 v5, v49 -; SI-NEXT: v_mov_b32_e32 v6, v50 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: v_mov_b32_e32 v8, v48 -; SI-NEXT: v_mov_b32_e32 v9, v36 -; SI-NEXT: v_mov_b32_e32 v10, v38 -; SI-NEXT: v_mov_b32_e32 v11, v34 +; SI-NEXT: v_mov_b32_e32 v2, v39 +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: v_mov_b32_e32 v4, v33 +; SI-NEXT: v_mov_b32_e32 v5, v53 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v7, v52 +; SI-NEXT: v_mov_b32_e32 v8, v35 +; SI-NEXT: v_mov_b32_e32 v9, v51 +; 
SI-NEXT: v_mov_b32_e32 v10, v36 +; SI-NEXT: v_mov_b32_e32 v11, v50 ; SI-NEXT: v_mov_b32_e32 v12, v37 -; SI-NEXT: v_mov_b32_e32 v13, v32 -; SI-NEXT: v_mov_b32_e32 v14, v35 -; SI-NEXT: v_mov_b32_e32 v15, v33 +; SI-NEXT: v_mov_b32_e32 v13, v49 +; SI-NEXT: v_mov_b32_e32 v14, v38 +; SI-NEXT: v_mov_b32_e32 v15, v48 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32f16: @@ -45905,45 +45906,44 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[9:10], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[4:5], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 
v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -45952,14 +45952,15 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; 
SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: .LBB77_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: @@ -46131,19 +46132,17 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 @@ -46167,9 +46166,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 @@ -46179,7 +46176,9 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 @@ -46201,50 +46200,50 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, 
v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v31, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 @@ -46253,41 +46252,41 @@ define 
<8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB78_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 @@ -46382,15 +46381,14 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: .LBB78_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -46537,8 +46535,9 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 @@ -46558,12 +46557,11 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 @@ -46574,10 +46572,10 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 @@ -46590,10 +46588,10 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 ; SI-NEXT: v_or_b32_e32 v4, v39, v4 ; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: v_or_b32_e32 v6, v35, v6 @@ -46608,10 +46606,10 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_cbranch_execnz .LBB79_3 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -46620,21 +46618,21 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 
v2, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 @@ -46936,11 +46934,11 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v8f64_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -46953,6 +46951,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -46976,7 +46975,6 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 @@ -46989,29 +46987,30 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, 
i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v1 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: .LBB80_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -47041,40 +47040,40 
@@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v1 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v55 -; SI-NEXT: v_mov_b32_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v2, v53 -; SI-NEXT: v_mov_b32_e32 v3, v52 -; SI-NEXT: v_mov_b32_e32 v4, v51 -; SI-NEXT: v_mov_b32_e32 v5, v50 -; SI-NEXT: v_mov_b32_e32 v6, v49 -; SI-NEXT: v_mov_b32_e32 v7, v48 -; SI-NEXT: v_mov_b32_e32 v8, v39 -; SI-NEXT: 
v_mov_b32_e32 v9, v38 -; SI-NEXT: v_mov_b32_e32 v10, v37 -; SI-NEXT: v_mov_b32_e32 v11, v36 -; SI-NEXT: v_mov_b32_e32 v12, v35 -; SI-NEXT: v_mov_b32_e32 v13, v34 -; SI-NEXT: v_mov_b32_e32 v14, v33 -; SI-NEXT: v_mov_b32_e32 v15, v32 +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v1, v52 +; SI-NEXT: v_mov_b32_e32 v2, v55 +; SI-NEXT: v_mov_b32_e32 v3, v51 +; SI-NEXT: v_mov_b32_e32 v4, v50 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v48 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v9, v37 +; SI-NEXT: v_mov_b32_e32 v10, v36 +; SI-NEXT: v_mov_b32_e32 v11, v35 +; SI-NEXT: v_mov_b32_e32 v12, v34 +; SI-NEXT: v_mov_b32_e32 v13, v33 +; SI-NEXT: v_mov_b32_e32 v14, v32 +; SI-NEXT: v_mov_b32_e32 v15, v53 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32bf16: @@ -47426,23 +47425,21 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 @@ -47461,26 +47458,26 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; 
SI-NEXT: v_mul_f32_e32 v16, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 @@ -47493,79 +47490,79 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: 
v_alignbit_b32 v3, v3, v40, 16 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v15, v15, v17, 16 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB82_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; 
SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -47630,7 +47627,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -47638,22 +47635,21 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; SI-NEXT: .LBB82_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -48811,9 +48807,10 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 
v36, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s19 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 @@ -48832,27 +48829,26 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, 
v39 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 @@ -48862,13 +48858,13 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16 ; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v52, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v50, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v48, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v38, 16 ; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 ; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 ; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 @@ -48880,44 +48876,44 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 
v3, 0xffff0000, v53 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v51 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v49 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v37 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -48986,11 +48982,11 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -49014,279 +49010,279 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_bfe_u32 v4, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v14, v1, v2, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v5 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; 
VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, 
v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; 
VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v3 +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v6, 16, 1 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; VI-NEXT: v_bfe_u32 v2, v16, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v16 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v16 ; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v16, v3 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v2, v17, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v17 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v17 ; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; 
VI-NEXT: v_or_b32_e32 v3, 0x400000, v16 +; VI-NEXT: v_bfe_u32 v2, v18, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v18 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v18 ; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v17 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v17 +; VI-NEXT: v_bfe_u32 v2, v19, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v19 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 
0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v19 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v16 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; VI-NEXT: v_bfe_u32 v18, v19, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v18, v19 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v19, 
v18 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_bfe_u32 v18, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; VI-NEXT: s_branch .LBB83_5 ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -49308,10 +49304,10 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB83_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -49320,11 +49316,11 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v21, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v21, s31, 1 ; GFX9-NEXT: v_readfirstlane_b32 s30, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s31, v1 @@ -49348,296 +49344,296 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 
v14, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v14, v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: 
v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 
16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: 
v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 
v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 
0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 
16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v2 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v2, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 
0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc ; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v18, v17 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v18 ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; 
GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v19, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v0, v18, 16, v0 ; GFX9-NEXT: s_branch .LBB83_5 ; GFX9-NEXT: .LBB83_3: ; GFX9-NEXT: s_branch .LBB83_2 @@ -49659,10 +49655,10 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; GFX9-NEXT: v_mov_b32_e32 v14, s30 ; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: .LBB83_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: v_readlane_b32 s31, v21, 1 +; GFX9-NEXT: v_readlane_b32 s30, v21, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -51685,11 +51681,11 @@ define inreg <64 x i8> 
@bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_alignbit_b32 v2, s5, v1, 24 -; SI-NEXT: v_alignbit_b32 v17, s5, v1, 16 -; SI-NEXT: v_alignbit_b32 v18, s5, v1, 8 +; SI-NEXT: v_alignbit_b32 v17, s5, v1, 24 +; SI-NEXT: v_alignbit_b32 v18, s5, v1, 16 +; SI-NEXT: v_alignbit_b32 v20, s5, v1, 8 ; SI-NEXT: v_mov_b32_e32 v1, s28 -; SI-NEXT: v_alignbit_b32 v20, s29, v1, 24 +; SI-NEXT: v_alignbit_b32 v2, s29, v1, 24 ; SI-NEXT: v_alignbit_b32 v4, s29, v1, 16 ; SI-NEXT: v_alignbit_b32 v19, s29, v1, 8 ; SI-NEXT: v_mov_b32_e32 v1, s26 @@ -51706,10 +51702,10 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v26, s23, v1, 8 ; SI-NEXT: v_mov_b32_e32 v1, s20 ; SI-NEXT: v_alignbit_b32 v12, s21, v1, 24 -; SI-NEXT: v_alignbit_b32 v14, s21, v1, 16 -; SI-NEXT: v_alignbit_b32 v16, s21, v1, 8 +; SI-NEXT: v_alignbit_b32 v27, s21, v1, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v1, 8 ; SI-NEXT: v_mov_b32_e32 v1, s18 -; SI-NEXT: v_alignbit_b32 v27, s19, v1, 24 +; SI-NEXT: v_alignbit_b32 v16, s19, v1, 24 ; SI-NEXT: v_alignbit_b32 v28, s19, v1, 16 ; SI-NEXT: v_alignbit_b32 v29, s19, v1, 8 ; SI-NEXT: v_mov_b32_e32 v1, s16 @@ -51758,10 +51754,10 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_readfirstlane_b32 s21, v12 ; SI-NEXT: v_readfirstlane_b32 s19, v14 ; SI-NEXT: v_readfirstlane_b32 s17, v16 -; SI-NEXT: v_alignbit_b32 v2, s5, v1, 24 -; SI-NEXT: v_alignbit_b32 v17, s5, v1, 16 -; SI-NEXT: v_alignbit_b32 v18, s5, v1, 8 -; SI-NEXT: v_alignbit_b32 v20, s29, v3, 24 +; SI-NEXT: v_alignbit_b32 v17, s5, v1, 24 +; SI-NEXT: v_alignbit_b32 v18, s5, v1, 16 +; SI-NEXT: v_alignbit_b32 v20, s5, v1, 8 +; SI-NEXT: v_alignbit_b32 v2, s29, v3, 24 ; SI-NEXT: v_alignbit_b32 v4, s29, v3, 16 ; SI-NEXT: v_alignbit_b32 v19, s29, v3, 8 ; SI-NEXT: v_alignbit_b32 v6, s27, v5, 24 @@ -51774,6 
+51770,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v25, s23, v9, 16 ; SI-NEXT: v_alignbit_b32 v26, s23, v9, 8 ; SI-NEXT: v_alignbit_b32 v12, s21, v11, 24 +; SI-NEXT: v_alignbit_b32 v27, s21, v11, 16 ; SI-NEXT: s_lshr_b32 s8, s5, 24 ; SI-NEXT: s_lshr_b32 s9, s5, 16 ; SI-NEXT: s_lshr_b32 s10, s5, 8 @@ -51798,9 +51795,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s61, s17, 24 ; SI-NEXT: s_lshr_b32 s62, s17, 16 ; SI-NEXT: s_lshr_b32 s63, s17, 8 -; SI-NEXT: v_alignbit_b32 v14, s21, v11, 16 -; SI-NEXT: v_alignbit_b32 v16, s21, v11, 8 -; SI-NEXT: v_alignbit_b32 v27, s19, v13, 24 +; SI-NEXT: v_alignbit_b32 v14, s21, v11, 8 +; SI-NEXT: v_alignbit_b32 v16, s19, v13, 24 ; SI-NEXT: v_alignbit_b32 v28, s19, v13, 16 ; SI-NEXT: v_alignbit_b32 v29, s19, v13, 8 ; SI-NEXT: v_alignbit_b32 v30, s17, v15, 24 @@ -51811,41 +51807,42 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 ; 
SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr11 @@ -51855,7 +51852,6 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: ; SI-NEXT: v_mov_b32_e32 v1, s4 @@ -51899,27 +51895,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s59, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s58, 24 -; SI-NEXT: v_or_b32_e32 v15, v27, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 
s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v16, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 -; SI-NEXT: v_mov_b32_e32 v15, s4 -; SI-NEXT: buffer_store_dword v15, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v28, v15, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v14 ; SI-NEXT: s_and_b32 s4, s21, 0xff ; SI-NEXT: s_lshl_b32 s6, s57, 8 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v27 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s56, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 @@ -52019,35 +52015,36 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: v_or_b32_e32 v3, v3, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s11, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: 
buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 ; SI-NEXT: s_and_b32 s4, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s9, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s8, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -52143,70 +52140,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB85_4 ; VI-NEXT: .LBB85_2: ; %cmp.true -; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 ; VI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 -; VI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; VI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; VI-NEXT: v_add_f64 v[11:12], s[22:23], 1.0 -; VI-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 -; VI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 -; VI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; VI-NEXT: v_add_f64 v[7:8], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[15:16], s[18:19], 1.0 
+; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; VI-NEXT: v_add_f64 v[17:18], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; VI-NEXT: v_readfirstlane_b32 s17, v14 -; VI-NEXT: v_readfirstlane_b32 s19, v10 -; VI-NEXT: v_readfirstlane_b32 s21, v16 -; VI-NEXT: v_readfirstlane_b32 s23, v12 -; VI-NEXT: v_readfirstlane_b32 s25, v8 -; VI-NEXT: v_readfirstlane_b32 s27, v6 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; VI-NEXT: v_readfirstlane_b32 s17, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v12 +; VI-NEXT: v_readfirstlane_b32 s23, v14 +; VI-NEXT: v_readfirstlane_b32 s25, v10 +; VI-NEXT: v_readfirstlane_b32 s27, v8 ; VI-NEXT: v_readfirstlane_b32 s29, v4 ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 ; VI-NEXT: s_lshr_b32 s58, s5, 8 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, 
v1 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v1 ; VI-NEXT: s_lshr_b32 s59, s29, 24 ; VI-NEXT: s_lshr_b32 s60, s29, 16 ; VI-NEXT: s_lshr_b32 s61, s29, 8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v3 ; VI-NEXT: s_lshr_b32 s62, s27, 24 ; VI-NEXT: s_lshr_b32 s63, s27, 16 ; VI-NEXT: s_lshr_b32 s72, s27, 8 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_lshr_b32 s73, s25, 24 ; VI-NEXT: s_lshr_b32 s74, s25, 16 ; VI-NEXT: s_lshr_b32 s75, s25, 8 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 ; VI-NEXT: s_lshr_b32 s76, s23, 24 ; VI-NEXT: s_lshr_b32 s77, s23, 16 ; VI-NEXT: s_lshr_b32 s78, s23, 8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 ; VI-NEXT: s_lshr_b32 s79, s21, 24 ; VI-NEXT: s_lshr_b32 s88, s21, 16 ; VI-NEXT: s_lshr_b32 s89, s21, 8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v15 ; VI-NEXT: s_lshr_b32 s90, s19, 24 ; VI-NEXT: s_lshr_b32 s91, s19, 16 ; VI-NEXT: s_lshr_b32 s30, s19, 8 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 ; VI-NEXT: s_lshr_b32 s31, s17, 24 ; VI-NEXT: s_lshr_b32 s34, s17, 16 ; VI-NEXT: s_lshr_b32 s35, s17, 8 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v17 ; VI-NEXT: s_branch .LBB85_5 ; VI-NEXT: .LBB85_3: ; VI-NEXT: ; implicit-def: $sgpr66 @@ -52259,10 +52256,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: s_branch .LBB85_2 ; VI-NEXT: .LBB85_4: -; VI-NEXT: v_mov_b32_e32 v13, s16 -; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v11, s20 ; VI-NEXT: v_mov_b32_e32 v48, s67 -; VI-NEXT: v_mov_b32_e32 v49, s66 +; VI-NEXT: v_mov_b32_e32 v2, s66 ; VI-NEXT: v_mov_b32_e32 v38, s65 ; VI-NEXT: 
v_mov_b32_e32 v39, s64 ; VI-NEXT: v_mov_b32_e32 v36, s55 @@ -52275,22 +52273,21 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v31, s48 ; VI-NEXT: v_mov_b32_e32 v28, s39 ; VI-NEXT: v_mov_b32_e32 v29, s38 -; VI-NEXT: v_mov_b32_e32 v26, s37 -; VI-NEXT: v_mov_b32_e32 v27, s36 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v7, s24 -; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v6, s37 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v7, s26 ; VI-NEXT: v_mov_b32_e32 v3, s28 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v24, s6 -; VI-NEXT: v_mov_b32_e32 v23, s8 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v21, s12 -; VI-NEXT: v_mov_b32_e32 v20, s14 -; VI-NEXT: v_mov_b32_e32 v19, s40 -; VI-NEXT: v_mov_b32_e32 v18, s42 -; VI-NEXT: v_mov_b32_e32 v17, s44 +; VI-NEXT: v_mov_b32_e32 v25, s6 +; VI-NEXT: v_mov_b32_e32 v27, s36 +; VI-NEXT: v_mov_b32_e32 v24, s8 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v20, s40 +; VI-NEXT: v_mov_b32_e32 v19, s42 +; VI-NEXT: v_mov_b32_e32 v5, s44 ; VI-NEXT: .LBB85_5: ; %end ; VI-NEXT: s_and_b32 s4, s17, 0xff ; VI-NEXT: s_lshl_b32 s6, s35, 8 @@ -52298,11 +52295,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s34, 0xff ; VI-NEXT: s_lshl_b32 s7, s31, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v25 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -52316,9 +52313,9 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s90, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -52329,34 +52326,34 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_and_b32 s4, s21, 0xff ; VI-NEXT: s_lshl_b32 s6, s89, 8 -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s88, 0xff ; VI-NEXT: s_lshl_b32 s7, s79, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: buffer_store_dword 
v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v10, s4 ; VI-NEXT: s_and_b32 s4, s23, 0xff ; VI-NEXT: s_lshl_b32 s6, s78, 8 -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 16, v0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 20, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s77, 0xff ; VI-NEXT: s_lshl_b32 s7, s76, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; VI-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -52373,9 +52370,9 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s73, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -52392,9 +52389,9 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s62, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v20 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -52412,7 +52409,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s60, 0xff ; VI-NEXT: s_lshl_b32 s7, s59, 8 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -52431,9 +52428,9 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_and_b32 s5, s57, 0xff ; VI-NEXT: s_lshl_b32 s6, s56, 8 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v2, v26, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -52547,70 +52544,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB85_4 ; GFX9-NEXT: .LBB85_2: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], s[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[11:12], s[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX9-NEXT: v_add_f64 v[7:8], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] -; GFX9-NEXT: v_readfirstlane_b32 s17, v14 -; GFX9-NEXT: v_readfirstlane_b32 s19, v12 -; GFX9-NEXT: v_readfirstlane_b32 s21, v16 -; GFX9-NEXT: v_readfirstlane_b32 s23, v10 -; GFX9-NEXT: v_readfirstlane_b32 s25, v8 -; GFX9-NEXT: v_readfirstlane_b32 s27, v6 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[11:12] +; GFX9-NEXT: 
v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX9-NEXT: v_readfirstlane_b32 s17, v18 +; GFX9-NEXT: v_readfirstlane_b32 s19, v16 +; GFX9-NEXT: v_readfirstlane_b32 s21, v14 +; GFX9-NEXT: v_readfirstlane_b32 s23, v12 +; GFX9-NEXT: v_readfirstlane_b32 s25, v10 +; GFX9-NEXT: v_readfirstlane_b32 s27, v8 ; GFX9-NEXT: v_readfirstlane_b32 s29, v4 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v17 ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 ; GFX9-NEXT: s_lshr_b32 s57, s5, 16 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 ; GFX9-NEXT: s_lshr_b32 s59, s29, 24 ; GFX9-NEXT: s_lshr_b32 s60, s29, 16 ; GFX9-NEXT: s_lshr_b32 s61, s29, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 ; GFX9-NEXT: s_lshr_b32 s62, s27, 24 ; GFX9-NEXT: s_lshr_b32 s63, s27, 16 ; GFX9-NEXT: s_lshr_b32 s72, s27, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 ; GFX9-NEXT: s_lshr_b32 s73, s25, 24 ; GFX9-NEXT: s_lshr_b32 s74, s25, 16 ; GFX9-NEXT: s_lshr_b32 s75, s25, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 8, v7 ; GFX9-NEXT: s_lshr_b32 s76, s23, 24 ; GFX9-NEXT: s_lshr_b32 s77, s23, 16 ; GFX9-NEXT: s_lshr_b32 s78, s23, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9 ; GFX9-NEXT: s_lshr_b32 s79, s21, 24 ; GFX9-NEXT: s_lshr_b32 s88, s21, 16 ; GFX9-NEXT: s_lshr_b32 s89, s21, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v15 ; GFX9-NEXT: s_lshr_b32 s90, s19, 24 ; GFX9-NEXT: s_lshr_b32 s91, s19, 16 ; GFX9-NEXT: s_lshr_b32 s92, s19, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 ; GFX9-NEXT: s_lshr_b32 s93, s17, 24 ; GFX9-NEXT: s_lshr_b32 s94, s17, 16 ; GFX9-NEXT: s_lshr_b32 s95, s17, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v17 ; GFX9-NEXT: s_branch .LBB85_5 ; GFX9-NEXT: .LBB85_3: ; GFX9-NEXT: ; implicit-def: $sgpr54 @@ -52663,10 +52660,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB85_2 ; GFX9-NEXT: .LBB85_4: -; GFX9-NEXT: v_mov_b32_e32 v13, s16 -; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 ; GFX9-NEXT: v_mov_b32_e32 v39, s55 -; GFX9-NEXT: v_mov_b32_e32 v49, s54 +; GFX9-NEXT: v_mov_b32_e32 v2, s54 ; GFX9-NEXT: v_mov_b32_e32 v37, s53 ; GFX9-NEXT: v_mov_b32_e32 v48, s52 ; GFX9-NEXT: v_mov_b32_e32 v36, s51 @@ -52679,36 +52677,35 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v31, s36 ; GFX9-NEXT: v_mov_b32_e32 v28, s35 ; GFX9-NEXT: v_mov_b32_e32 v29, s34 -; GFX9-NEXT: v_mov_b32_e32 v26, s31 -; GFX9-NEXT: v_mov_b32_e32 v27, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; 
GFX9-NEXT: v_mov_b32_e32 v6, s31 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s26 ; GFX9-NEXT: v_mov_b32_e32 v3, s28 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v24, s6 -; GFX9-NEXT: v_mov_b32_e32 v23, s8 -; GFX9-NEXT: v_mov_b32_e32 v22, s10 -; GFX9-NEXT: v_mov_b32_e32 v21, s12 -; GFX9-NEXT: v_mov_b32_e32 v20, s14 -; GFX9-NEXT: v_mov_b32_e32 v19, s40 -; GFX9-NEXT: v_mov_b32_e32 v18, s42 -; GFX9-NEXT: v_mov_b32_e32 v17, s44 +; GFX9-NEXT: v_mov_b32_e32 v25, s6 +; GFX9-NEXT: v_mov_b32_e32 v27, s30 +; GFX9-NEXT: v_mov_b32_e32 v24, s8 +; GFX9-NEXT: v_mov_b32_e32 v23, s10 +; GFX9-NEXT: v_mov_b32_e32 v22, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s14 +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s42 +; GFX9-NEXT: v_mov_b32_e32 v5, s44 ; GFX9-NEXT: .LBB85_5: ; %end ; GFX9-NEXT: s_and_b32 s4, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s95, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s94, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s93, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v25 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v39, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; 
GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -52719,9 +52716,9 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s7, s90, 8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v48 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v15, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ -52734,28 +52731,28 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s88, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s79, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v38 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v38 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v23 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_or_b32_sdwa v8, v13, v8 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_and_b32 s4, s23, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s78, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s77, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s76, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ -52770,9 +52767,9 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s7, s73, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ 
-52787,9 +52784,9 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s7, s62, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ -52805,7 +52802,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s6, s60, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s59, 8 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v19 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -52822,9 +52819,9 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s5, s57, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s56, 8 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; 
GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58024,51 +58021,51 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB88_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 3 -; VI-NEXT: v_add_u16_sdwa v19, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v20, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v15, 3, v15 -; VI-NEXT: v_or_b32_e32 v15, v15, v19 -; VI-NEXT: v_add_u16_sdwa v19, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 3, v14 -; VI-NEXT: v_or_b32_e32 v14, v14, v19 -; VI-NEXT: v_add_u16_sdwa v19, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v20 +; VI-NEXT: v_add_u16_sdwa v20, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v21 ; VI-NEXT: v_add_u16_e32 v13, 3, v13 -; VI-NEXT: v_or_b32_e32 v13, v13, v19 -; VI-NEXT: v_add_u16_sdwa v19, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v12, 3, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v19 -; VI-NEXT: v_add_u16_sdwa v19, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v20 +; VI-NEXT: v_add_u16_sdwa v20, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v21 ; VI-NEXT: v_add_u16_e32 v11, 3, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v19 -; VI-NEXT: v_add_u16_sdwa v19, v10, v16 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v10, 3, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v19 -; VI-NEXT: v_add_u16_sdwa v19, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v20 +; VI-NEXT: v_add_u16_sdwa v20, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v21 ; VI-NEXT: v_add_u16_e32 v9, 3, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v19 -; VI-NEXT: v_add_u16_sdwa v19, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v19 -; VI-NEXT: v_add_u16_sdwa v19, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v20 +; VI-NEXT: v_add_u16_sdwa v20, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v21 ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v19 -; VI-NEXT: v_add_u16_sdwa v19, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v19 -; VI-NEXT: v_add_u16_sdwa v19, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v5, 3, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v19 -; VI-NEXT: v_add_u16_sdwa v19, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_add_u16_sdwa v17, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa 
v18, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v19 ; VI-NEXT: v_add_u16_sdwa v19, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v16, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v20 +; VI-NEXT: v_add_u16_sdwa v20, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v21 +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_add_u16_sdwa v21, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v16, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_e32 v3, v3, v16 +; VI-NEXT: v_or_b32_e32 v5, v5, v20 +; VI-NEXT: v_or_b32_e32 v4, v4, v16 +; VI-NEXT: v_or_b32_e32 v3, v3, v21 ; VI-NEXT: v_or_b32_e32 v2, v2, v19 ; VI-NEXT: v_or_b32_e32 v1, v1, v18 ; VI-NEXT: v_or_b32_e32 v0, v0, v17 @@ -58541,8 +58538,8 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -58575,45 +58572,32 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: 
v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ 
-58622,12 +58606,12 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 @@ -58636,22 +58620,28 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v26, v26, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 
v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -58660,9 +58650,10 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_or_b32_e32 v18, v18, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -58678,10 +58669,11 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -58690,11 +58682,17 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: 
v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v10, v10, v32 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -58702,7 +58700,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v32, 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 @@ -58710,7 +58708,6 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 @@ -58718,7 +58715,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_or_b32_e32 v28, v28, v32 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -58733,51 +58730,51 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; 
VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v15 ; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v19, v15 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v14 ; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v19, v14 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v15, v20, v15 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v14, v21, v14 ; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v19, v13 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v12 ; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v19, v12 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v13, v20, v13 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v12, v21, v12 ; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v19, v11 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v10 ; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v19, v10 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v11, v20, v11 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v10, v21, v10 ; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v19, v9 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: 
v_add_f16_e32 v21, 0x200, v8 ; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v19, v8 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v9, v20, v9 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v8, v21, v8 ; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v19, v7 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v6 ; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v19, v6 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v19, v5 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 -; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v19, v4 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v7, v20, v7 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v6, v21, v6 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v5, v20, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_or_b32_e32 v3, v21, v3 ; VI-NEXT: v_or_b32_e32 v2, v19, v2 ; VI-NEXT: v_or_b32_e32 v1, v18, v1 ; VI-NEXT: v_or_b32_e32 v0, v16, v0 @@ -58865,33 +58862,27 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-LABEL: bitcast_v32f16_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v18 ; SI-NEXT: v_mov_b32_e32 v31, v17 ; SI-NEXT: v_mov_b32_e32 v30, v16 ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: 
v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 @@ -58904,42 +58895,48 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB91_3 ; SI-NEXT: .LBB91_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: 
v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -58948,12 +58945,12 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; 
SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 @@ -58962,22 +58959,28 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v26, v26, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -58986,9 +58989,10 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: 
v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_or_b32_e32 v18, v18, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -59004,10 +59008,11 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -59016,11 +59021,17 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v10, v10, v32 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v9 @@ -59028,7 +59039,7 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v32, 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 @@ -59036,7 +59047,6 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 @@ -59044,7 +59054,7 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_or_b32_e32 v28, v28, v32 ; SI-NEXT: .LBB91_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: @@ -59077,51 +59087,51 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB91_3 ; VI-NEXT: .LBB91_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v15 ; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v19, v15 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v14 ; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v19, v14 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v15, v20, v15 +; 
VI-NEXT: v_add_f16_e32 v20, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v14, v21, v14 ; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v19, v13 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v12 ; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v19, v12 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v13, v20, v13 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v12, v21, v12 ; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v19, v11 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v10 ; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v19, v10 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v11, v20, v11 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v10, v21, v10 ; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v19, v9 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v8 ; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v19, v8 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v9, v20, v9 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v8, v21, v8 ; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v19, v7 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v6 ; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v19, v6 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v19, v5 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 -; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v19, v4 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v7, v20, v7 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v6, v21, v6 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v5, v20, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_or_b32_e32 v3, v21, v3 ; VI-NEXT: v_or_b32_e32 v2, v19, v2 ; VI-NEXT: v_or_b32_e32 v1, v18, v1 ; VI-NEXT: v_or_b32_e32 v0, v16, v0 
@@ -59242,13 +59252,11 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v2 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: v_mov_b32_e32 v49, v30 ; SI-NEXT: v_mov_b32_e32 v55, v28 ; SI-NEXT: v_mov_b32_e32 v54, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v49, v30 ; SI-NEXT: v_mov_b32_e32 v53, v24 ; SI-NEXT: v_mov_b32_e32 v52, v22 ; SI-NEXT: v_mov_b32_e32 v51, v20 @@ -59260,6 +59268,8 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v36, v8 ; SI-NEXT: v_mov_b32_e32 v35, v6 ; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -59275,6 +59285,8 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 @@ -59286,15 +59298,13 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 
v31, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB92_3 @@ -59450,51 +59460,51 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 3 -; VI-NEXT: v_add_u16_sdwa v19, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v20, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v15, 3, v15 -; VI-NEXT: v_or_b32_e32 v15, v15, v19 -; VI-NEXT: v_add_u16_sdwa v19, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 3, v14 -; VI-NEXT: v_or_b32_e32 v14, v14, v19 -; VI-NEXT: v_add_u16_sdwa v19, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v20 +; VI-NEXT: v_add_u16_sdwa v20, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v21 ; VI-NEXT: v_add_u16_e32 v13, 3, v13 -; VI-NEXT: v_or_b32_e32 v13, v13, v19 -; VI-NEXT: v_add_u16_sdwa v19, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v12, 3, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v19 -; VI-NEXT: v_add_u16_sdwa v19, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v20 +; VI-NEXT: v_add_u16_sdwa v20, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v21 ; VI-NEXT: v_add_u16_e32 v11, 3, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v19 -; VI-NEXT: v_add_u16_sdwa v19, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v10, 3, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v19 -; VI-NEXT: v_add_u16_sdwa v19, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v20 +; VI-NEXT: v_add_u16_sdwa v20, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v21 ; VI-NEXT: v_add_u16_e32 v9, 3, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v19 -; VI-NEXT: v_add_u16_sdwa v19, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v19 -; VI-NEXT: v_add_u16_sdwa v19, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v20 +; VI-NEXT: v_add_u16_sdwa v20, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v21 ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v19 -; VI-NEXT: v_add_u16_sdwa v19, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v19 -; VI-NEXT: v_add_u16_sdwa v19, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v5, 3, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v19 -; VI-NEXT: v_add_u16_sdwa v19, v4, v16 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_add_u16_sdwa v17, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v18, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v19 ; VI-NEXT: v_add_u16_sdwa v19, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v16, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v20 +; VI-NEXT: v_add_u16_sdwa v20, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v21 +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_add_u16_sdwa v21, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v16, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_e32 v3, v3, v16 +; VI-NEXT: v_or_b32_e32 v5, v5, v20 +; VI-NEXT: v_or_b32_e32 v4, v4, v16 +; VI-NEXT: v_or_b32_e32 v3, v3, v21 ; VI-NEXT: v_or_b32_e32 v2, v2, v19 ; VI-NEXT: v_or_b32_e32 v1, v1, v18 ; VI-NEXT: v_or_b32_e32 v0, v0, v17 @@ -59598,20 +59608,20 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v20 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s14, s16, 16 -; SI-NEXT: s_lshl_b32 s15, s17, 16 -; SI-NEXT: s_lshl_b32 s40, s18, 16 -; SI-NEXT: s_lshl_b32 s41, s19, 16 -; SI-NEXT: s_lshl_b32 s42, s20, 16 -; SI-NEXT: s_lshl_b32 s43, s21, 16 -; SI-NEXT: s_lshl_b32 s6, s22, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 16 -; SI-NEXT: s_lshl_b32 s8, s24, 16 -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; 
SI-NEXT: s_lshl_b32 s10, s26, 16 -; SI-NEXT: s_lshl_b32 s11, s27, 16 -; SI-NEXT: s_lshl_b32 s12, s28, 16 -; SI-NEXT: s_lshl_b32 s13, s29, 16 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_lshl_b32 s14, s17, 16 +; SI-NEXT: s_lshl_b32 s15, s18, 16 +; SI-NEXT: s_lshl_b32 s40, s19, 16 +; SI-NEXT: s_lshl_b32 s41, s20, 16 +; SI-NEXT: s_lshl_b32 s42, s21, 16 +; SI-NEXT: s_lshl_b32 s43, s22, 16 +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_lshl_b32 s7, s24, 16 +; SI-NEXT: s_lshl_b32 s8, s25, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_lshl_b32 s11, s28, 16 +; SI-NEXT: s_lshl_b32 s12, s29, 16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 @@ -59636,12 +59646,13 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s7, s25, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s6, 0x30000 ; SI-NEXT: s_and_b32 s6, s22, 0xffff -; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s20, 0xffff +; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s9, s21, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 @@ -59653,7 +59664,7 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_or_b32 s6, s9, s6 ; SI-NEXT: s_and_b32 s9, s18, 0xffff ; SI-NEXT: s_lshl_b32 s10, s19, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -59691,23 +59702,22 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: s_add_i32 s4, 
s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: s_and_b32 s15, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s10, 16 -; SI-NEXT: s_and_b32 s41, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s9, 16 -; SI-NEXT: s_and_b32 s43, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s7, 16 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s5, 16 -; SI-NEXT: s_and_b32 s13, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s4, 16 +; SI-NEXT: s_and_b32 s14, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s10, 16 +; SI-NEXT: s_and_b32 s40, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s9, 16 +; SI-NEXT: s_and_b32 s42, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s6, 16 +; SI-NEXT: s_and_b32 s6, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s8, 16 +; SI-NEXT: s_and_b32 s8, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s10, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s5, 16 +; SI-NEXT: s_and_b32 s12, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s4, 16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 @@ -59727,22 +59737,23 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 ; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_mov_b32_e32 v2, s40 -; SI-NEXT: v_mov_b32_e32 v3, s41 -; SI-NEXT: v_mov_b32_e32 v4, s42 -; SI-NEXT: v_mov_b32_e32 v5, s43 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v8, s8 -; SI-NEXT: v_mov_b32_e32 v9, s9 -; SI-NEXT: v_mov_b32_e32 v10, s10 -; 
SI-NEXT: v_mov_b32_e32 v11, s11 -; SI-NEXT: v_mov_b32_e32 v12, s12 -; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: v_mov_b32_e32 v3, s40 +; SI-NEXT: v_mov_b32_e32 v4, s41 +; SI-NEXT: v_mov_b32_e32 v5, s42 +; SI-NEXT: v_mov_b32_e32 v6, s43 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v8, s7 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v13, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr40 @@ -59756,7 +59767,6 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -60288,300 +60298,300 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; 
VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 -; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 -; VI-NEXT: v_add_u32_e32 v19, vcc, s6, v19 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; VI-NEXT: v_bfe_u32 v19, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v2 -; VI-NEXT: v_add_u32_e32 v19, vcc, s6, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 
+; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; VI-NEXT: v_add_u32_e32 v20, vcc, s6, v20 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v20, v19 +; VI-NEXT: s_movk_i32 s8, 0x7fff +; VI-NEXT: v_add_u32_e32 v1, vcc, s8, v1 +; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_bfe_u32 v20, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v3 -; VI-NEXT: v_add_u32_e32 v20, vcc, s6, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, s6, v21 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, v22, v21 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, s8, v16 +; VI-NEXT: v_bfe_u32 v23, v17, 16, 1 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v21 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v3 +; 
VI-NEXT: v_cndmask_b32_e32 v16, v16, v27, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v23, v17 +; VI-NEXT: v_add_u32_e32 v3, vcc, s8, v3 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v3, vcc, s8, v3 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cndmask_b32_e32 v17, v3, v30, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v29, v28 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s8, v3 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v18, v4, v18, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v34 +; VI-NEXT: v_bfe_u32 v22, v20, 16, 1 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v19, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v22, v20 +; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v5 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: 
v_cndmask_b32_e32 v20, v21, v22, vcc -; VI-NEXT: v_bfe_u32 v21, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v4 -; VI-NEXT: v_add_u32_e32 v21, vcc, s6, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, s6, v22 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_bfe_u32 v22, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v5 -; VI-NEXT: v_add_u32_e32 v22, vcc, s6, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 -; VI-NEXT: v_add_u32_e32 v23, vcc, s6, v23 +; VI-NEXT: v_cndmask_b32_e32 v19, v5, v21, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v24, v23 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; VI-NEXT: v_bfe_u32 v23, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v6 -; VI-NEXT: v_add_u32_e32 v23, vcc, s6, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 -; VI-NEXT: v_add_u32_e32 v24, vcc, s6, v24 -; 
VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v5 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; VI-NEXT: v_bfe_u32 v24, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v7 -; VI-NEXT: v_add_u32_e32 v24, vcc, s6, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, s6, v25 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v27, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v6 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_bfe_u32 v30, v28, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v20, v6, v29, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v30, v28 +; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v6 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v28 +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v21, v6, v32, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v33, v31 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_bfe_u32 v35, v22, 16, 1 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v34, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v35, v22 +; 
VI-NEXT: v_add_u32_e32 v7, vcc, s8, v7 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v22 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v36, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v24, v23 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; VI-NEXT: v_bfe_u32 v25, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v8 -; VI-NEXT: v_add_u32_e32 v25, vcc, s6, v25 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, s6, v26 +; VI-NEXT: v_add_u32_e32 v7, vcc, s8, v7 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v26, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v8, vcc, s8, v8 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_bfe_u32 v30, v28, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v23, v8, v29, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, v30, v28 +; VI-NEXT: v_add_u32_e32 v8, vcc, s8, v8 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v31, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, v33, v32 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, s8, v24 +; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v24, v24, v34, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v9, vcc, s8, v9 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v26, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v10, vcc, s8, v10 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_bfe_u32 v30, v28, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_bfe_u32 v26, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v9 -; VI-NEXT: v_add_u32_e32 v26, vcc, s6, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_cndmask_b32_e32 v25, v10, v29, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, v30, v28 +; VI-NEXT: v_add_u32_e32 v10, vcc, s8, v10 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v31, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, v33, v32 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_add_u32_e32 v30, vcc, s8, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, s6, v27 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v28, 
0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; VI-NEXT: v_bfe_u32 v27, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v10 -; VI-NEXT: v_add_u32_e32 v27, vcc, s6, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, s6, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v30, v30, v34, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, s8, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v26 ; VI-NEXT: v_bfe_u32 v28, v11, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc ; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v11 -; VI-NEXT: v_add_u32_e32 v28, vcc, s6, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_u32_e32 v28, vcc, s8, v28 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v11 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v12 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, s6, v29 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; 
VI-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v33, v32 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, s8, v31 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v32 ; VI-NEXT: v_bfe_u32 v29, v12, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc ; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v12 -; VI-NEXT: v_add_u32_e32 v29, vcc, s6, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 +; VI-NEXT: v_add_u32_e32 v29, vcc, s8, v29 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v12 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, s6, v30 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc -; VI-NEXT: v_bfe_u32 v30, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v13 -; VI-NEXT: v_add_u32_e32 v30, vcc, s6, v30 -; VI-NEXT: v_or_b32_e32 v31, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 -; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc -; 
VI-NEXT: v_bfe_u32 v31, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 -; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 +; VI-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v33, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_add_u32_e32 v29, vcc, s8, v29 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v32 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v31, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s8, v13 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v33, vcc ; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 -; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, s8, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v31 +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; VI-NEXT: v_bfe_u32 v32, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v15 -; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc +; 
VI-NEXT: v_add_u32_e32 v32, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, s8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, s8, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e64 v15, s[4:5], v15, v14 +; VI-NEXT: v_add_u32_e64 v15, s[4:5], s8, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v14, v14 +; VI-NEXT: v_cndmask_b32_e64 v14, v15, v34, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; VI-NEXT: v_alignbit_b32 v15, v14, v15, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_alignbit_b32 v14, v14, v12, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v15, v15, v31, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v30, 16 -; VI-NEXT: v_alignbit_b32 v13, v13, v29, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v28, 16 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v13, v12, 
v29, 16 +; VI-NEXT: v_alignbit_b32 v12, v11, v26, 16 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v6, v22, v6, 16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; VI-NEXT: v_alignbit_b32 v11, v11, v27, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v26, 16 -; VI-NEXT: v_alignbit_b32 v9, v9, v25, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v24, 16 -; VI-NEXT: v_alignbit_b32 v7, v7, v23, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v22, 16 -; VI-NEXT: v_alignbit_b32 v5, v5, v21, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v20, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v19, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_alignbit_b32 v10, v26, v10, 16 +; VI-NEXT: v_alignbit_b32 v9, v25, v9, 16 +; VI-NEXT: v_alignbit_b32 v8, v24, v8, 16 +; VI-NEXT: v_alignbit_b32 v7, v23, v7, 16 +; VI-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v17, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; VI-NEXT: v_alignbit_b32 v0, v22, v0, 16 ; VI-NEXT: .LBB94_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32bf16_to_v32i16: @@ -60594,247 +60604,247 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v0 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v16 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v17, v17, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v19, v19, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1 -; GFX9-NEXT: 
v_add3_u32 v19, v19, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v20, v20, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v20, v20, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v21, v21, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc +; GFX9-NEXT: v_add3_u32 v16, v19, v18, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v1 +; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v20 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v16, v24, vcc +; GFX9-NEXT: v_add3_u32 v18, v21, v20, s6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc -; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v21, v21, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 
v21, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX9-NEXT: v_add3_u32 v17, v23, v22, s6 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v25 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v2, vcc +; GFX9-NEXT: v_add3_u32 v2, v26, v25, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v3 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v30, vcc +; GFX9-NEXT: v_add3_u32 v4, v28, v27, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v29 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v22, v22, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v32, vcc +; GFX9-NEXT: v_add3_u32 v4, v31, v29, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v33 +; GFX9-NEXT: v_bfe_u32 v24, v21, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v34, v33, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v21 +; GFX9-NEXT: v_bfe_u32 v35, v23, 16, 1 +; 
GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v4, v19, vcc +; GFX9-NEXT: v_add3_u32 v4, v24, v21, s6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v22, v22, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v23, v23, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v23, v23, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v24, v24, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v23 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc +; GFX9-NEXT: v_add3_u32 v5, v35, v23, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v24, v24, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX9-NEXT: v_bfe_u32 v30, v27, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v22, vcc +; GFX9-NEXT: v_add3_u32 v5, v26, v25, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v27 +; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v28, vcc +; GFX9-NEXT: v_add3_u32 v6, v30, v27, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v6, v31, vcc +; GFX9-NEXT: v_add3_u32 v6, v32, v29, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v24 +; GFX9-NEXT: v_bfe_u32 v35, v23, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v33, vcc +; GFX9-NEXT: v_add3_u32 v7, v34, v24, s6 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v25, v25, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v25, v25, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v26, v26, 
v25, s6 -; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v23 +; GFX9-NEXT: v_bfe_u32 v37, v25, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v7, v36, vcc +; GFX9-NEXT: v_add3_u32 v7, v35, v23, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX9-NEXT: v_bfe_u32 v30, v27, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v26, vcc +; GFX9-NEXT: v_add3_u32 v8, v37, v25, s6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v8, v28, vcc +; GFX9-NEXT: v_add3_u32 v8, v30, v27, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v31 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v26, v26, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v29, vcc +; GFX9-NEXT: v_add3_u32 v29, v32, v31, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v33 +; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v29, v24, vcc +; GFX9-NEXT: v_add3_u32 v29, v34, v33, s6 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v27, v27, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v9 +; GFX9-NEXT: v_bfe_u32 v30, v27, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v27, v27, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v28, v28, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v29, v26, vcc +; GFX9-NEXT: v_add3_u32 v25, v25, v9, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v27 +; GFX9-NEXT: v_bfe_u32 v32, v10, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v25, v28, vcc +; GFX9-NEXT: v_add3_u32 v28, v30, v27, s6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX9-NEXT: v_bfe_u32 v29, v9, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc +; GFX9-NEXT: v_add3_u32 v31, v32, v10, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX9-NEXT: 
v_or_b32_e32 v30, 0x400000, v9 +; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v31, v33, vcc +; GFX9-NEXT: v_add3_u32 v29, v29, v9, s6 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v11 +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v29, v30, vcc +; GFX9-NEXT: v_add3_u32 v27, v27, v11, s6 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v29, v29, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc -; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v29, v29, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v27, v32, vcc +; GFX9-NEXT: v_add3_u32 v32, v33, v10, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v10 +; GFX9-NEXT: v_bfe_u32 v30, v12, 16, 1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v33 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v12 +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc +; GFX9-NEXT: v_add3_u32 v30, v30, v12, s6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v30, v30, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc -; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v30, v30, v13, s6 -; GFX9-NEXT: 
v_or_b32_e32 v31, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v10 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v30, v11, vcc +; GFX9-NEXT: v_add3_u32 v30, v33, v10, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v31, v31, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v30, v32, vcc +; GFX9-NEXT: v_add3_u32 v13, v13, v12, s6 +; GFX9-NEXT: v_bfe_u32 v32, v10, 16, 1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc -; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v31, v31, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v33, vcc +; GFX9-NEXT: v_add3_u32 v32, v32, v10, s6 +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc +; GFX9-NEXT: v_add3_u32 v32, v33, v14, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc +; GFX9-NEXT: v_add3_u32 v32, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v32, v32, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v31, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v30, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v29, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v28, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v27, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v26, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v25, s6 -; GFX9-NEXT: v_perm_b32 v8, v8, v24, s6 -; GFX9-NEXT: v_perm_b32 v7, v7, v23, s6 -; GFX9-NEXT: v_perm_b32 v6, v6, v22, s6 -; GFX9-NEXT: v_perm_b32 v5, v5, v21, s6 -; GFX9-NEXT: v_perm_b32 v4, v4, v20, s6 -; GFX9-NEXT: v_perm_b32 v3, v3, v19, s6 -; GFX9-NEXT: v_perm_b32 v2, v2, v18, s6 -; GFX9-NEXT: v_perm_b32 v1, v1, v17, s6 -; GFX9-NEXT: v_perm_b32 v0, v0, v16, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc +; GFX9-NEXT: v_perm_b32 v15, v14, v32, s6 +; GFX9-NEXT: v_perm_b32 v14, v10, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s6 +; GFX9-NEXT: v_perm_b32 v12, v11, v9, s6 +; GFX9-NEXT: 
v_perm_b32 v11, v27, v29, s6 +; GFX9-NEXT: v_perm_b32 v10, v31, v28, s6 +; GFX9-NEXT: v_perm_b32 v9, v25, v26, s6 +; GFX9-NEXT: v_perm_b32 v8, v24, v8, s6 +; GFX9-NEXT: v_perm_b32 v7, v23, v7, s6 +; GFX9-NEXT: v_perm_b32 v6, v22, v6, s6 +; GFX9-NEXT: v_perm_b32 v5, v21, v5, s6 +; GFX9-NEXT: v_perm_b32 v4, v20, v4, s6 +; GFX9-NEXT: v_perm_b32 v3, v19, v3, s6 +; GFX9-NEXT: v_perm_b32 v2, v18, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v17, v1, s6 +; GFX9-NEXT: v_perm_b32 v0, v16, v0, s6 ; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -61446,10 +61456,11 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 @@ -61468,8 +61479,7 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v16 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 @@ -61485,8 +61495,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; 
SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v62 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 @@ -61613,9 +61623,9 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 ; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -61687,11 +61697,11 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v21, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v21, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -61700,294 +61710,294 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, 
v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 
0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v15, v6, v5, 16 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_bfe_u32 v2, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_bfe_u32 v4, v5, 16, 1 +; VI-NEXT: v_alignbit_b32 v14, v2, v3, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v5 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: 
v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v13, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v4 ; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; 
VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v11, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v4 ; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v9, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: 
s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v4 ; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v7, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: 
v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v4 ; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; 
VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v5, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v16, v3 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v16, v17, 16, 1 ; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v16, v17 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; 
VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v16, v18, 16, 1 ; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v16, v18 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v18, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_alignbit_b32 v2, v17, v2, 16 +; 
VI-NEXT: v_bfe_u32 v17, v19, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v19 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_f32_e32 v0, s6, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v19, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v18, v18 +; VI-NEXT: v_bfe_u32 v18, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v18, v20, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_cndmask_b32_e64 v1, v17, v19, s[4:5] +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, 
v18, v16, 16 ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -62009,10 +62019,10 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB95_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: v_readlane_b32 s31, v21, 1 +; VI-NEXT: v_readlane_b32 s30, v21, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -62021,11 +62031,11 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v21, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v21, s31, 1 ; GFX9-NEXT: v_readfirstlane_b32 s30, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s31, v1 @@ -62034,295 +62044,295 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; GFX9-NEXT: s_cbranch_execnz .LBB95_4 ; GFX9-NEXT: .LBB95_2: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; 
GFX9-NEXT: s_lshl_b32 s5, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s5, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_and_or_b32 v14, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff0000 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; 
GFX9-NEXT: v_and_or_b32 v15, v3, v16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_and_or_b32 v15, v4, v16, v5 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX9-NEXT: v_and_or_b32 v14, v2, v16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v13, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 
v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_and_or_b32 v13, v1, v16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v12, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_or_b32 v12, v1, v16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v11, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_and_or_b32 v11, v1, v16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v10, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_or_b32 v10, v1, v16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; 
GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v9, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_and_or_b32 v9, v1, v16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v8, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_or_b32 v8, v1, v16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v7, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_and_or_b32 v7, v1, v16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v6, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_or_b32 v6, v1, v16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, 
v4, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_and_or_b32 v5, v1, v16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v4, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_and_or_b32 v4, v1, v16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 
v2, v3, v17, vcc ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v3, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_and_or_b32 v3, v1, v16, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v2 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v2, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_bfe_u32 v2, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_and_or_b32 v1, v1, v16, v17 ; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_and_or_b32 v2, v1, v16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v18, v17 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v17, v18 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX9-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_add_u32_e32 v17, v17, v18 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v20, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v17, v16, v0 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v18, 16, v1 +; GFX9-NEXT: v_and_or_b32 v1, v17, v16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v19, v16, v18 ; GFX9-NEXT: s_branch .LBB95_5 ; GFX9-NEXT: .LBB95_3: ; GFX9-NEXT: s_branch .LBB95_2 @@ -62344,10 +62354,10 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, s30 ; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: .LBB95_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: v_readlane_b32 s31, v21, 1 +; GFX9-NEXT: v_readlane_b32 s30, v21, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -64909,269 +64919,269 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v37, s30, 0 ; SI-NEXT: v_writelane_b32 v37, s31, 1 -; SI-NEXT: v_writelane_b32 v37, s34, 2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v37, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s34, v18 -; SI-NEXT: v_readfirstlane_b32 s35, v17 -; SI-NEXT: v_readfirstlane_b32 s30, v14 -; SI-NEXT: v_readfirstlane_b32 s31, v13 -; SI-NEXT: v_readfirstlane_b32 s94, v10 -; SI-NEXT: v_readfirstlane_b32 s95, v9 -; SI-NEXT: v_readfirstlane_b32 s92, v6 -; SI-NEXT: v_readfirstlane_b32 s93, v5 -; SI-NEXT: v_readfirstlane_b32 s90, v2 -; SI-NEXT: v_readfirstlane_b32 s91, v1 +; SI-NEXT: v_writelane_b32 v37, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s79, v18 +; SI-NEXT: v_readfirstlane_b32 s89, v17 +; SI-NEXT: v_readfirstlane_b32 s75, v14 +; SI-NEXT: v_readfirstlane_b32 s77, v13 +; SI-NEXT: v_readfirstlane_b32 s72, v10 +; SI-NEXT: v_readfirstlane_b32 s74, v9 +; SI-NEXT: v_readfirstlane_b32 s60, v6 +; SI-NEXT: v_readfirstlane_b32 s62, v5 +; SI-NEXT: 
v_readfirstlane_b32 s57, v2 +; SI-NEXT: v_readfirstlane_b32 s58, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_writelane_b32 v37, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_or_b32 s42, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xffff ; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: s_or_b32 s41, s4, s5 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_or_b32 s15, s4, s5 ; SI-NEXT: s_and_b32 s4, s22, 0xffff ; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_or_b32 s14, s4, s5 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: v_mov_b32_e32 v1, s40 -; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s42 +; SI-NEXT: s_or_b32 s13, s4, s5 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: v_alignbit_b32 v18, s41, v1, 24 -; SI-NEXT: v_alignbit_b32 v25, s41, v1, 16 -; SI-NEXT: v_alignbit_b32 v30, s41, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: v_alignbit_b32 v16, s41, v1, 24 +; SI-NEXT: v_alignbit_b32 v21, s41, v1, 16 +; SI-NEXT: v_alignbit_b32 v25, s41, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: v_alignbit_b32 v19, s15, v1, 24 -; SI-NEXT: v_alignbit_b32 v26, s15, v1, 16 -; SI-NEXT: v_alignbit_b32 v31, s15, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: s_and_b32 s4, s91, 0xffff -; SI-NEXT: s_lshl_b32 s5, s90, 16 -; SI-NEXT: v_alignbit_b32 v17, s13, v1, 24 -; SI-NEXT: v_alignbit_b32 v23, s13, v1, 16 -; 
SI-NEXT: v_alignbit_b32 v29, s13, v1, 8 +; SI-NEXT: v_alignbit_b32 v17, s14, v1, 24 +; SI-NEXT: v_alignbit_b32 v22, s14, v1, 16 +; SI-NEXT: v_alignbit_b32 v26, s14, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_alignbit_b32 v16, s11, v1, 24 -; SI-NEXT: v_alignbit_b32 v20, s11, v1, 16 -; SI-NEXT: v_alignbit_b32 v27, s11, v1, 8 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: v_alignbit_b32 v14, s12, v1, 24 +; SI-NEXT: v_alignbit_b32 v20, s12, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, s12, v1, 8 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_and_b32 s4, s62, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: v_alignbit_b32 v13, s10, v1, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v1, 16 +; SI-NEXT: v_alignbit_b32 v23, s10, v1, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s93, 0xffff -; SI-NEXT: s_lshl_b32 s5, s92, 16 -; SI-NEXT: v_or_b32_e32 v5, v1, v33 ; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s74, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: v_or_b32_e32 v5, v1, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: s_and_b32 s4, s95, 0xffff -; SI-NEXT: s_lshl_b32 s5, s94, 16 -; SI-NEXT: v_or_b32_e32 v4, v1, v34 ; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: v_or_b32_e32 v2, v1, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: s_and_b32 s4, s31, 0xffff -; SI-NEXT: s_lshl_b32 s5, s30, 16 -; SI-NEXT: v_or_b32_e32 v2, v1, v35 ; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_and_b32 s4, s35, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 16 -; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: s_and_b32 s4, s89, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 ; SI-NEXT: 
s_or_b32 s6, s4, s5 ; SI-NEXT: v_alignbit_b32 v9, s9, v5, 24 -; SI-NEXT: v_alignbit_b32 v12, s9, v5, 16 -; SI-NEXT: v_alignbit_b32 v21, s9, v5, 8 -; SI-NEXT: v_alignbit_b32 v6, s8, v4, 24 -; SI-NEXT: v_alignbit_b32 v8, s8, v4, 16 -; SI-NEXT: v_alignbit_b32 v13, s8, v4, 8 -; SI-NEXT: v_alignbit_b32 v24, s7, v2, 24 -; SI-NEXT: v_alignbit_b32 v28, s7, v2, 16 -; SI-NEXT: v_alignbit_b32 v32, s7, v2, 8 -; SI-NEXT: v_alignbit_b32 v10, s6, v1, 24 -; SI-NEXT: v_alignbit_b32 v14, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v22, s6, v1, 8 -; SI-NEXT: s_lshr_b32 s78, s41, 8 -; SI-NEXT: s_lshr_b32 s75, s15, 8 -; SI-NEXT: s_lshr_b32 s72, s13, 8 -; SI-NEXT: s_lshr_b32 s61, s11, 8 -; SI-NEXT: s_lshr_b32 s58, s9, 8 -; SI-NEXT: s_lshr_b32 s47, s8, 8 +; SI-NEXT: v_alignbit_b32 v10, s9, v5, 16 +; SI-NEXT: v_alignbit_b32 v19, s9, v5, 8 +; SI-NEXT: v_alignbit_b32 v6, s8, v2, 24 +; SI-NEXT: v_alignbit_b32 v8, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v12, s8, v2, 8 +; SI-NEXT: v_alignbit_b32 v28, s7, v1, 24 +; SI-NEXT: v_alignbit_b32 v29, s7, v1, 16 +; SI-NEXT: v_alignbit_b32 v30, s7, v1, 8 +; SI-NEXT: v_alignbit_b32 v31, s6, v4, 24 +; SI-NEXT: v_alignbit_b32 v32, s6, v4, 16 +; SI-NEXT: v_alignbit_b32 v27, s6, v4, 8 +; SI-NEXT: s_lshr_b32 s31, s41, 8 +; SI-NEXT: s_lshr_b32 s94, s14, 8 +; SI-NEXT: s_lshr_b32 s91, s12, 8 +; SI-NEXT: s_lshr_b32 s78, s10, 8 +; SI-NEXT: s_lshr_b32 s63, s9, 8 +; SI-NEXT: s_lshr_b32 s56, s8, 8 ; SI-NEXT: s_lshr_b32 s45, s7, 8 -; SI-NEXT: s_lshr_b32 s42, s6, 8 -; SI-NEXT: s_and_b32 s88, s19, 0xffff -; SI-NEXT: s_and_b32 s77, s23, 0xffff -; SI-NEXT: s_and_b32 s74, s27, 0xffff -; SI-NEXT: s_and_b32 s63, s90, 0xffff -; SI-NEXT: s_and_b32 s60, s92, 0xffff -; SI-NEXT: s_and_b32 s57, s94, 0xffff -; SI-NEXT: s_and_b32 s46, s30, 0xffff -; SI-NEXT: s_and_b32 s43, s34, 0xffff -; SI-NEXT: s_bfe_u32 s89, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s79, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s76, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s73, s90, 0x80008 -; SI-NEXT: s_bfe_u32 s62, s92, 0x80008 -; 
SI-NEXT: s_bfe_u32 s59, s94, 0x80008 -; SI-NEXT: s_bfe_u32 s56, s30, 0x80008 -; SI-NEXT: s_bfe_u32 s44, s34, 0x80008 +; SI-NEXT: s_lshr_b32 s40, s6, 8 +; SI-NEXT: s_and_b32 s34, s19, 0xffff +; SI-NEXT: s_and_b32 s95, s23, 0xffff +; SI-NEXT: s_and_b32 s92, s27, 0xffff +; SI-NEXT: s_and_b32 s88, s57, 0xffff +; SI-NEXT: s_and_b32 s73, s60, 0xffff +; SI-NEXT: s_and_b32 s59, s72, 0xffff +; SI-NEXT: s_and_b32 s46, s75, 0xffff +; SI-NEXT: s_and_b32 s43, s79, 0xffff +; SI-NEXT: s_bfe_u32 s35, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s30, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s93, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s90, s57, 0x80008 +; SI-NEXT: s_bfe_u32 s76, s60, 0x80008 +; SI-NEXT: s_bfe_u32 s61, s72, 0x80008 +; SI-NEXT: s_bfe_u32 s47, s75, 0x80008 +; SI-NEXT: s_bfe_u32 s44, s79, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: s_and_b32 s4, s35, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_add_i32 s89, s89, 3 +; SI-NEXT: s_and_b32 s4, s89, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_add_i32 s77, s77, 3 ; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s31, 0xffff -; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_add_i32 s74, s74, 3 ; SI-NEXT: s_add_i32 s7, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s95, 0xffff -; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: s_and_b32 s4, s74, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_add_i32 s62, s62, 3 ; SI-NEXT: s_add_i32 s8, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s93, 0xffff -; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_and_b32 s4, s62, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 
s9, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s91, s91, 3 -; SI-NEXT: s_add_i32 s10, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s91, 0xffff -; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s22, 0xffff ; SI-NEXT: s_lshl_b32 s5, s23, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s18, 0xffff ; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v4, vcc, 
0x30000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_add_i32 s41, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v6, s42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_alignbit_b32 v18, s41, v6, 24 -; SI-NEXT: v_alignbit_b32 v25, s41, v6, 16 -; SI-NEXT: v_alignbit_b32 v30, s41, v6, 8 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_alignbit_b32 v16, s41, v6, 24 +; SI-NEXT: v_alignbit_b32 v21, s41, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, s41, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_or_b32_e32 v3, v33, v3 -; SI-NEXT: v_alignbit_b32 v19, s15, v6, 24 -; SI-NEXT: v_alignbit_b32 v26, s15, v6, 16 -; SI-NEXT: v_alignbit_b32 v31, s15, v6, 8 -; SI-NEXT: v_mov_b32_e32 v6, s12 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_alignbit_b32 v17, s14, v6, 24 +; SI-NEXT: v_alignbit_b32 v22, s14, v6, 16 +; SI-NEXT: v_alignbit_b32 v26, s14, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s13 ; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_mov_b32_e32 v11, s7 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; SI-NEXT: v_mov_b32_e32 v10, s7 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 ; SI-NEXT: v_mov_b32_e32 v7, s8 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v3 ; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_alignbit_b32 v17, s13, v6, 24 -; SI-NEXT: v_alignbit_b32 v23, s13, v6, 16 -; SI-NEXT: v_alignbit_b32 v29, s13, v6, 8 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_alignbit_b32 v16, s11, v6, 24 -; SI-NEXT: v_alignbit_b32 v20, s11, v6, 16 -; SI-NEXT: v_alignbit_b32 v27, s11, v6, 8 +; SI-NEXT: 
v_alignbit_b32 v14, s12, v6, 24 +; SI-NEXT: v_alignbit_b32 v20, s12, v6, 16 +; SI-NEXT: v_alignbit_b32 v24, s12, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_alignbit_b32 v13, s10, v6, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v6, 16 +; SI-NEXT: v_alignbit_b32 v23, s10, v6, 8 ; SI-NEXT: v_alignbit_b32 v9, v3, v5, 24 -; SI-NEXT: v_alignbit_b32 v12, v3, v5, 16 -; SI-NEXT: v_alignbit_b32 v21, v3, v5, 8 -; SI-NEXT: v_alignbit_b32 v6, v7, v4, 24 -; SI-NEXT: v_alignbit_b32 v8, v7, v4, 16 -; SI-NEXT: v_alignbit_b32 v13, v7, v4, 8 -; SI-NEXT: v_alignbit_b32 v24, v10, v2, 24 -; SI-NEXT: v_alignbit_b32 v28, v10, v2, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v2, 8 -; SI-NEXT: v_alignbit_b32 v10, v15, v1, 24 -; SI-NEXT: v_alignbit_b32 v14, v15, v1, 16 -; SI-NEXT: v_alignbit_b32 v22, v15, v1, 8 -; SI-NEXT: s_lshr_b32 s89, s41, 24 -; SI-NEXT: s_lshr_b32 s88, s41, 16 -; SI-NEXT: s_lshr_b32 s78, s41, 8 -; SI-NEXT: s_lshr_b32 s79, s15, 24 -; SI-NEXT: s_lshr_b32 s77, s15, 16 -; SI-NEXT: s_lshr_b32 s75, s15, 8 -; SI-NEXT: s_lshr_b32 s76, s13, 24 -; SI-NEXT: s_lshr_b32 s74, s13, 16 -; SI-NEXT: s_lshr_b32 s72, s13, 8 -; SI-NEXT: s_lshr_b32 s73, s11, 24 -; SI-NEXT: s_lshr_b32 s63, s11, 16 -; SI-NEXT: s_lshr_b32 s61, s11, 8 -; SI-NEXT: s_lshr_b32 s62, s9, 24 -; SI-NEXT: s_lshr_b32 s60, s9, 16 -; SI-NEXT: s_lshr_b32 s58, s9, 8 -; SI-NEXT: s_lshr_b32 s59, s8, 24 -; SI-NEXT: s_lshr_b32 s57, s8, 16 -; SI-NEXT: s_lshr_b32 s47, s8, 8 -; SI-NEXT: s_lshr_b32 s56, s7, 24 +; SI-NEXT: v_alignbit_b32 v10, v3, v5, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v5, 8 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 24 +; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 8 +; SI-NEXT: v_alignbit_b32 v28, v11, v1, 24 +; SI-NEXT: v_alignbit_b32 v29, v11, v1, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v1, 8 +; SI-NEXT: v_alignbit_b32 v31, v15, v4, 24 +; SI-NEXT: v_alignbit_b32 v32, v15, v4, 16 +; SI-NEXT: v_alignbit_b32 v27, v15, v4, 8 +; SI-NEXT: s_lshr_b32 s35, s41, 24 +; SI-NEXT: 
s_lshr_b32 s34, s41, 16 +; SI-NEXT: s_lshr_b32 s31, s41, 8 +; SI-NEXT: s_lshr_b32 s30, s14, 24 +; SI-NEXT: s_lshr_b32 s95, s14, 16 +; SI-NEXT: s_lshr_b32 s94, s14, 8 +; SI-NEXT: s_lshr_b32 s93, s12, 24 +; SI-NEXT: s_lshr_b32 s92, s12, 16 +; SI-NEXT: s_lshr_b32 s91, s12, 8 +; SI-NEXT: s_lshr_b32 s90, s10, 24 +; SI-NEXT: s_lshr_b32 s88, s10, 16 +; SI-NEXT: s_lshr_b32 s78, s10, 8 +; SI-NEXT: s_lshr_b32 s76, s9, 24 +; SI-NEXT: s_lshr_b32 s73, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s9, 8 +; SI-NEXT: s_lshr_b32 s61, s8, 24 +; SI-NEXT: s_lshr_b32 s59, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s8, 8 +; SI-NEXT: s_lshr_b32 s47, s7, 24 ; SI-NEXT: s_lshr_b32 s46, s7, 16 ; SI-NEXT: s_lshr_b32 s45, s7, 8 ; SI-NEXT: s_lshr_b32 s44, s6, 24 ; SI-NEXT: s_lshr_b32 s43, s6, 16 -; SI-NEXT: s_lshr_b32 s42, s6, 8 +; SI-NEXT: s_lshr_b32 s40, s6, 8 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v30 +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v25 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v25 +; SI-NEXT: s_lshl_b32 s5, s31, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_and_b32 s5, s34, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v16 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s89, 24 +; SI-NEXT: s_lshl_b32 s16, s35, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -65183,19 +65193,19 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v31 -; SI-NEXT: 
v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v26 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s94, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: s_and_b32 s5, s95, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v17 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s79, 24 +; SI-NEXT: s_lshl_b32 s14, s30, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -65208,19 +65218,19 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v24 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s91, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: s_and_b32 s5, s92, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v14 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s76, 24 +; SI-NEXT: s_lshl_b32 s12, s93, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -65233,19 +65243,19 @@ define inreg <64 x i8> 
@bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s63, 0xff +; SI-NEXT: s_and_b32 s5, s88, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v13 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s10, s73, 24 +; SI-NEXT: s_lshl_b32 s10, s90, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -65259,18 +65269,18 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v19 ; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_lshl_b32 s5, s63, 8 ; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s5, s73, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s62, 24 +; SI-NEXT: s_lshl_b32 s9, s76, 24 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; 
SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -65283,67 +65293,67 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_and_b32 s5, s59, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s8, s59, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_lshl_b32 s8, s61, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 
8, v30 ; SI-NEXT: s_and_b32 s4, s7, 0xff ; SI-NEXT: s_lshl_b32 s5, s45, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v29 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s56, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_lshl_b32 s7, s47, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s7, s5 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 ; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_lshl_b32 s5, s40, 8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s43, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v10 +; 
SI-NEXT: v_lshlrev_b32_e32 v3, 24, v31 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s44, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -65368,70 +65378,70 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; 
SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr44 -; 
SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v32i16_to_v64i8_scalar: @@ -66011,32 +66021,32 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, s25, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, s24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] ; GFX9-NEXT: v_pk_add_u16 v10, s23, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, s22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX9-NEXT: v_pk_add_u16 v12, s21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, s20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v4, s29, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v3, s28, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_pk_add_u16 v16, s19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, s16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] -; GFX9-NEXT: 
buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v14, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v19, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 @@ -66065,16 +66075,16 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v14 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v18 ; GFX9-NEXT: s_branch .LBB97_5 ; GFX9-NEXT: .LBB97_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 @@ -66127,34 +66137,34 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB97_2 ; GFX9-NEXT: .LBB97_4: -; GFX9-NEXT: v_mov_b32_e32 v21, s44 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s44 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s42 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 ; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 ; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 ; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v8, s25 ; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 ; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v16, s55 +; GFX9-NEXT: v_mov_b32_e32 v61, s53 +; GFX9-NEXT: v_mov_b32_e32 v15, s54 +; GFX9-NEXT: v_mov_b32_e32 v59, s52 +; GFX9-NEXT: v_mov_b32_e32 v60, 
s51 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s19 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v17, s55 -; GFX9-NEXT: v_mov_b32_e32 v62, s53 -; GFX9-NEXT: v_mov_b32_e32 v13, s54 -; GFX9-NEXT: v_mov_b32_e32 v60, s52 -; GFX9-NEXT: v_mov_b32_e32 v61, s51 ; GFX9-NEXT: v_mov_b32_e32 v58, s50 -; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v62, s48 ; GFX9-NEXT: v_mov_b32_e32 v57, s49 ; GFX9-NEXT: v_mov_b32_e32 v47, s39 ; GFX9-NEXT: v_mov_b32_e32 v56, s38 @@ -66186,45 +66196,45 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v29, s60 ; GFX9-NEXT: v_mov_b32_e32 v28, s58 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 -; GFX9-NEXT: v_mov_b32_e32 v14, s57 -; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s56 +; GFX9-NEXT: v_mov_b32_e32 v22, s12 +; GFX9-NEXT: v_mov_b32_e32 v23, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s8 +; GFX9-NEXT: v_mov_b32_e32 v25, s6 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v21, s14 ; GFX9-NEXT: .LBB97_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 -; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v18, v61, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v59, v16 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v15, v62, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v23 ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 @@ -66236,7 +66246,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 @@ -66248,7 +66258,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 @@ -66260,7 +66270,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 @@ -66312,8 +66322,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload @@ -71534,51 +71544,51 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v15 ; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v19, v15 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v14 ; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v19, v14 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v15, v20, v15 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v14, v21, v14 ; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 
v13, v19, v13 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v12 ; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v19, v12 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v13, v20, v13 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v12, v21, v12 ; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v19, v11 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v10 ; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v19, v10 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v11, v20, v11 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v10, v21, v10 ; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v19, v9 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v8 ; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v19, v8 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v9, v20, v9 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v8, v21, v8 ; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v19, v7 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v6 ; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v19, v6 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_e32 v5, v19, v5 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 -; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v19, v4 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v7, v20, v7 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v6, v21, v6 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v5, v20, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_or_b32_e32 v3, v21, v3 ; VI-NEXT: v_or_b32_e32 v2, v19, v2 ; VI-NEXT: v_or_b32_e32 v1, v18, v1 ; VI-NEXT: v_or_b32_e32 v0, v16, v0 @@ -71684,6 +71694,7 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg % ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v32, 
s16 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 @@ -71709,7 +71720,6 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg % ; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 @@ -71968,51 +71978,51 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg % ; VI-NEXT: s_cbranch_execnz .LBB101_3 ; VI-NEXT: .LBB101_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v15 ; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v19, v15 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v14 ; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v19, v14 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v15, v20, v15 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v14, v21, v14 ; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v19, v13 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v12 ; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v19, v12 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v13, v20, v13 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v12, v21, v12 ; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v19, v11 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v10 ; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v19, v10 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v11, v20, v11 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v10, v21, v10 ; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v19, v9 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v8 ; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v19, v8 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v9, v20, v9 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v8, v21, v8 ; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v19, v7 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v6 ; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v19, v6 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v19, v5 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 -; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v19, v4 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v7, v20, v7 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v6, v21, v6 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v5, v20, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_or_b32_e32 v3, v21, v3 ; VI-NEXT: v_or_b32_e32 v2, v19, v2 ; VI-NEXT: v_or_b32_e32 v1, v18, v1 ; VI-NEXT: v_or_b32_e32 v0, v16, v0 @@ -72478,300 +72488,300 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: 
v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 -; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 -; VI-NEXT: v_add_u32_e32 v19, vcc, s6, v19 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; VI-NEXT: v_bfe_u32 v19, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v2 -; VI-NEXT: v_add_u32_e32 v19, vcc, s6, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v0 +; VI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v18, v16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; VI-NEXT: v_add_u32_e32 v20, vcc, s6, v20 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v20, v19 +; VI-NEXT: s_movk_i32 s8, 0x7fff +; VI-NEXT: v_add_u32_e32 v1, vcc, s8, v1 +; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_bfe_u32 v20, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v3 -; VI-NEXT: v_add_u32_e32 v20, vcc, s6, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, s6, v21 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, v22, v21 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, s8, v16 +; VI-NEXT: v_bfe_u32 v23, v17, 16, 1 +; VI-NEXT: 
v_or_b32_e32 v27, 0x400000, v21 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v16, v16, v27, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v23, v17 +; VI-NEXT: v_add_u32_e32 v3, vcc, s8, v3 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v3, vcc, s8, v3 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cndmask_b32_e32 v17, v3, v30, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v29, v28 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s8, v3 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v18, v4, v18, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v34 +; VI-NEXT: v_bfe_u32 v22, v20, 16, 1 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v19, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v22, v20 +; VI-NEXT: 
v_add_u32_e32 v5, vcc, s8, v5 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc -; VI-NEXT: v_bfe_u32 v21, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v4 -; VI-NEXT: v_add_u32_e32 v21, vcc, s6, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, s6, v22 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_bfe_u32 v22, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v5 -; VI-NEXT: v_add_u32_e32 v22, vcc, s6, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 -; VI-NEXT: v_add_u32_e32 v23, vcc, s6, v23 +; VI-NEXT: v_cndmask_b32_e32 v19, v5, v21, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v24, v23 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; VI-NEXT: v_bfe_u32 v23, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v6 -; VI-NEXT: v_add_u32_e32 v23, vcc, s6, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc -; VI-NEXT: 
v_lshlrev_b32_e32 v23, 16, v7 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 -; VI-NEXT: v_add_u32_e32 v24, vcc, s6, v24 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v5 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; VI-NEXT: v_bfe_u32 v24, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v7 -; VI-NEXT: v_add_u32_e32 v24, vcc, s6, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, s6, v25 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v27, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v6 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_bfe_u32 v30, v28, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v20, v6, v29, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v30, v28 +; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v6 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v28 +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v21, v6, v32, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v33, v31 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_bfe_u32 v35, v22, 16, 1 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v7 +; VI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v34, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v35, v22 +; VI-NEXT: v_add_u32_e32 v7, vcc, s8, v7 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v22 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v36, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v24, v23 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; VI-NEXT: v_bfe_u32 v25, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v8 -; VI-NEXT: v_add_u32_e32 v25, vcc, s6, v25 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, s6, v26 +; VI-NEXT: v_add_u32_e32 v7, vcc, s8, v7 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v26, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v8, vcc, s8, v8 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_bfe_u32 v30, v28, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v23, v8, v29, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, v30, v28 +; VI-NEXT: v_add_u32_e32 v8, vcc, s8, v8 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: 
v_cndmask_b32_e32 v8, v8, v31, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, v33, v32 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, s8, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v24, v24, v34, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v9, vcc, s8, v9 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v26, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v10, vcc, s8, v10 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_bfe_u32 v30, v28, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_bfe_u32 v26, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v9 -; VI-NEXT: v_add_u32_e32 v26, vcc, s6, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_cndmask_b32_e32 v25, v10, v29, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, v30, v28 +; VI-NEXT: v_add_u32_e32 v10, vcc, s8, v10 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v31, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, v33, v32 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_add_u32_e32 v30, vcc, s8, v30 +; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v32 ; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, s6, v27 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; VI-NEXT: v_bfe_u32 v27, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v10 -; VI-NEXT: v_add_u32_e32 v27, vcc, s6, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, s6, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v30, v30, v34, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, s8, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v26 ; VI-NEXT: v_bfe_u32 v28, v11, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc ; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v11 -; VI-NEXT: v_add_u32_e32 v28, vcc, s6, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_add_u32_e32 v28, vcc, s8, v28 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v11 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v12 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; 
VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, s6, v29 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v33, v32 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, s8, v31 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v32 ; VI-NEXT: v_bfe_u32 v29, v12, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc ; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v12 -; VI-NEXT: v_add_u32_e32 v29, vcc, s6, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 +; VI-NEXT: v_add_u32_e32 v29, vcc, s8, v29 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v12 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, s6, v30 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc -; VI-NEXT: v_bfe_u32 v30, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v13 -; VI-NEXT: v_add_u32_e32 v30, vcc, s6, v30 -; VI-NEXT: v_or_b32_e32 v31, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 -; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 -; VI-NEXT: 
v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc -; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 -; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 +; VI-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v33, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_add_u32_e32 v29, vcc, s8, v29 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v32 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v31, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s8, v13 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v33, vcc ; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 -; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, s8, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v31 +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; VI-NEXT: v_bfe_u32 v32, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v15 -; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 -; VI-NEXT: 
v_or_b32_e32 v33, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, s8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, s8, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e64 v15, s[4:5], v15, v14 +; VI-NEXT: v_add_u32_e64 v15, s[4:5], s8, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v14, v14 +; VI-NEXT: v_cndmask_b32_e64 v14, v15, v34, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; VI-NEXT: v_alignbit_b32 v15, v14, v15, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_alignbit_b32 v14, v14, v12, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v15, v15, v31, 
16 -; VI-NEXT: v_alignbit_b32 v14, v14, v30, 16 -; VI-NEXT: v_alignbit_b32 v13, v13, v29, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v28, 16 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v13, v12, v29, 16 +; VI-NEXT: v_alignbit_b32 v12, v11, v26, 16 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v6, v22, v6, 16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; VI-NEXT: v_alignbit_b32 v11, v11, v27, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v26, 16 -; VI-NEXT: v_alignbit_b32 v9, v9, v25, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v24, 16 -; VI-NEXT: v_alignbit_b32 v7, v7, v23, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v22, 16 -; VI-NEXT: v_alignbit_b32 v5, v5, v21, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v20, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v19, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_alignbit_b32 v10, v26, v10, 16 +; VI-NEXT: v_alignbit_b32 v9, v25, v9, 16 +; VI-NEXT: v_alignbit_b32 v8, v24, v8, 16 +; VI-NEXT: v_alignbit_b32 v7, v23, v7, 16 +; VI-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v17, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; VI-NEXT: v_alignbit_b32 v0, v22, v0, 16 ; VI-NEXT: .LBB102_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32bf16_to_v32f16: @@ -72784,247 +72794,247 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x 
bfloat> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v0 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v16 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v17, v17, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v19, v19, v18, s6 -; GFX9-NEXT: v_or_b32_e32 
v20, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v19, v19, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v20, v20, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v20, v20, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v21, v21, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc +; GFX9-NEXT: v_add3_u32 v16, v19, v18, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v1 +; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v20 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v16, v24, vcc +; GFX9-NEXT: v_add3_u32 v18, v21, v20, s6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc -; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1 
-; GFX9-NEXT: v_add3_u32 v21, v21, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX9-NEXT: v_add3_u32 v17, v23, v22, s6 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v25 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v2, vcc +; GFX9-NEXT: v_add3_u32 v2, v26, v25, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v3 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v30, vcc +; GFX9-NEXT: v_add3_u32 v4, v28, v27, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v29 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v22, v22, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v32, vcc +; GFX9-NEXT: v_add3_u32 v4, v31, v29, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v33 +; GFX9-NEXT: v_bfe_u32 v24, v21, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX9-NEXT: 
v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v34, v33, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v21 +; GFX9-NEXT: v_bfe_u32 v35, v23, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v4, v19, vcc +; GFX9-NEXT: v_add3_u32 v4, v24, v21, s6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v22, v22, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v23, v23, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v23, v23, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v24, v24, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v23 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc +; GFX9-NEXT: v_add3_u32 v5, v35, v23, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_add_f32_e32 v7, 
0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v24, v24, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX9-NEXT: v_bfe_u32 v30, v27, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v22, vcc +; GFX9-NEXT: v_add3_u32 v5, v26, v25, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v27 +; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v28, vcc +; GFX9-NEXT: v_add3_u32 v6, v30, v27, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v6, v31, vcc +; GFX9-NEXT: v_add3_u32 v6, v32, v29, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v24 +; GFX9-NEXT: v_bfe_u32 v35, v23, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v33, vcc +; GFX9-NEXT: v_add3_u32 v7, v34, v24, s6 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v25, v25, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v25, v25, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc -; 
GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v23 +; GFX9-NEXT: v_bfe_u32 v37, v25, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v7, v36, vcc +; GFX9-NEXT: v_add3_u32 v7, v35, v23, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX9-NEXT: v_bfe_u32 v30, v27, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v26, vcc +; GFX9-NEXT: v_add3_u32 v8, v37, v25, s6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v8, v28, vcc +; GFX9-NEXT: v_add3_u32 v8, v30, v27, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v31 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v26, v26, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v29, vcc +; GFX9-NEXT: v_add3_u32 v29, v32, v31, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v33 +; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 
v24, v29, v24, vcc +; GFX9-NEXT: v_add3_u32 v29, v34, v33, s6 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v27, v27, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v9 +; GFX9-NEXT: v_bfe_u32 v30, v27, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v27, v27, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v28, v28, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v29, v26, vcc +; GFX9-NEXT: v_add3_u32 v25, v25, v9, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v27 +; GFX9-NEXT: v_bfe_u32 v32, v10, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v25, v28, vcc +; GFX9-NEXT: v_add3_u32 v28, v30, v27, s6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX9-NEXT: v_bfe_u32 v29, v9, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_bfe_u32 v29, 
v28, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc +; GFX9-NEXT: v_add3_u32 v31, v32, v10, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v9 +; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v31, v33, vcc +; GFX9-NEXT: v_add3_u32 v29, v29, v9, s6 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v11 +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v29, v30, vcc +; GFX9-NEXT: v_add3_u32 v27, v27, v11, s6 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v29, v29, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc -; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v29, v29, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v27, v32, vcc +; GFX9-NEXT: v_add3_u32 v32, v33, v10, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v10 +; GFX9-NEXT: v_bfe_u32 v30, v12, 16, 1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v33 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v12 +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc +; GFX9-NEXT: v_add3_u32 v30, v30, v12, s6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v30, v30, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, 
v29 -; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc -; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v30, v30, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v10 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v30, v11, vcc +; GFX9-NEXT: v_add3_u32 v30, v33, v10, s6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v31, v31, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v30, v32, vcc +; GFX9-NEXT: v_add3_u32 v13, v13, v12, s6 +; GFX9-NEXT: v_bfe_u32 v32, v10, 16, 1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc -; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v31, v31, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v33, vcc +; GFX9-NEXT: v_add3_u32 v32, v32, v10, s6 +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc +; GFX9-NEXT: v_add3_u32 v32, v33, v14, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_add_f32_e32 v14, 
0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc +; GFX9-NEXT: v_add3_u32 v32, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v32, v32, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v31, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v30, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v29, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v28, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v27, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v26, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v25, s6 -; GFX9-NEXT: v_perm_b32 v8, v8, v24, s6 -; GFX9-NEXT: v_perm_b32 v7, v7, v23, s6 -; GFX9-NEXT: v_perm_b32 v6, v6, v22, s6 -; GFX9-NEXT: v_perm_b32 v5, v5, v21, s6 -; GFX9-NEXT: v_perm_b32 v4, v4, v20, s6 -; GFX9-NEXT: v_perm_b32 v3, v3, v19, s6 -; GFX9-NEXT: v_perm_b32 v2, v2, v18, s6 -; GFX9-NEXT: v_perm_b32 v1, v1, v17, s6 -; GFX9-NEXT: v_perm_b32 v0, v0, v16, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, 
v33, vcc +; GFX9-NEXT: v_perm_b32 v15, v14, v32, s6 +; GFX9-NEXT: v_perm_b32 v14, v10, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s6 +; GFX9-NEXT: v_perm_b32 v12, v11, v9, s6 +; GFX9-NEXT: v_perm_b32 v11, v27, v29, s6 +; GFX9-NEXT: v_perm_b32 v10, v31, v28, s6 +; GFX9-NEXT: v_perm_b32 v9, v25, v26, s6 +; GFX9-NEXT: v_perm_b32 v8, v24, v8, s6 +; GFX9-NEXT: v_perm_b32 v7, v23, v7, s6 +; GFX9-NEXT: v_perm_b32 v6, v22, v6, s6 +; GFX9-NEXT: v_perm_b32 v5, v21, v5, s6 +; GFX9-NEXT: v_perm_b32 v4, v20, v4, s6 +; GFX9-NEXT: v_perm_b32 v3, v19, v3, s6 +; GFX9-NEXT: v_perm_b32 v2, v18, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v17, v1, s6 +; GFX9-NEXT: v_perm_b32 v0, v16, v0, s6 ; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -73611,9 +73621,10 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v33, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v49, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v51, 1.0, v2 @@ -73639,7 +73650,6 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v35, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 @@ -73905,11 +73915,11 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v21, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v21, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -73918,294 +73928,294 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 
vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v0 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_alignbit_b32 v15, v6, v5, 16 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_bfe_u32 v2, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_f32_e32 v5, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: 
v_bfe_u32 v4, v5, 16, 1 +; VI-NEXT: v_alignbit_b32 v14, v2, v3, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v5 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v13, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 
0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v4 ; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v11, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 
0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v4 ; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: 
v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v9, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v4 ; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v7, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v4 ; VI-NEXT: s_and_b32 s4, s21, 
0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_alignbit_b32 v5, v4, v2, 16 +; VI-NEXT: v_add_u32_e32 v2, vcc, v16, v3 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: v_bfe_u32 v3, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_add_f32_e32 v17, s4, v0 +; 
VI-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v16, v17, 16, 1 ; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v16, v17 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_bfe_u32 v16, v18, 16, 1 ; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, v16, v18 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; 
VI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v18, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_f32_e32 v19, s4, v0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_alignbit_b32 v2, v17, v2, 16 +; VI-NEXT: v_bfe_u32 v17, v19, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v19 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_f32_e32 v0, s6, 
v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v19, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v18, v18 +; VI-NEXT: v_bfe_u32 v18, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v18, v20, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_cndmask_b32_e64 v1, v17, v19, s[4:5] +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v18, v16, 16 ; VI-NEXT: s_branch .LBB103_5 ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -74227,10 +74237,10 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB103_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: v_readlane_b32 s31, v21, 1 +; VI-NEXT: v_readlane_b32 s30, v21, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -74239,11 +74249,11 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, 
off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v21, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v21, s31, 1 ; GFX9-NEXT: v_readfirstlane_b32 s30, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s31, v1 @@ -74252,311 +74262,311 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; GFX9-NEXT: s_cbranch_execnz .LBB103_4 ; GFX9-NEXT: .LBB103_2: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s5, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s5, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; 
GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_lshl_or_b32 v15, v4, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; 
GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v14, v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: 
v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 
v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: 
v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, 
v3 ; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; 
GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc ; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v17, v2 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_bfe_u32 v2, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 ; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v18, v17 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 ; GFX9-NEXT: 
v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v18 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v18, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v19, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v19, 16, v0 ; GFX9-NEXT: 
s_branch .LBB103_5 ; GFX9-NEXT: .LBB103_3: ; GFX9-NEXT: s_branch .LBB103_2 @@ -74578,10 +74588,10 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; GFX9-NEXT: v_mov_b32_e32 v14, s30 ; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: .LBB103_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: v_readlane_b32 s31, v21, 1 +; GFX9-NEXT: v_readlane_b32 s30, v21, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -77026,7 +77036,8 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v22, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 @@ -77034,45 +77045,44 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 ; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v42, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], 
s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; SI-NEXT: v_or_b32_e32 v37, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_or_b32_e32 v37, v22, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; SI-NEXT: v_or_b32_e32 v32, v9, v8 ; SI-NEXT: v_alignbit_b32 v8, v32, v37, 24 @@ -77084,23 +77094,23 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v8, v32, v37, 8 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_or_b32_e32 v24, v12, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v25, v11, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_or_b32_e32 v23, v11, v8 -; SI-NEXT: v_alignbit_b32 v8, v23, v24, 24 +; SI-NEXT: v_or_b32_e32 v24, v10, v8 +; SI-NEXT: v_alignbit_b32 v8, v24, v25, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v23, v24, 16 +; SI-NEXT: v_alignbit_b32 v8, v24, v25, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v23, v24, 8 +; SI-NEXT: v_alignbit_b32 v8, v24, v25, 8 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 ; SI-NEXT: v_or_b32_e32 v18, v42, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_or_b32_e32 v19, v14, v8 +; SI-NEXT: v_or_b32_e32 v19, v13, v8 ; SI-NEXT: v_alignbit_b32 v8, v19, v18, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -77111,7 +77121,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_or_b32_e32 v16, v25, v8 +; SI-NEXT: v_or_b32_e32 v16, v23, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 ; SI-NEXT: v_or_b32_e32 v17, v28, v8 ; SI-NEXT: v_alignbit_b32 v8, v17, v16, 24 @@ -77127,14 +77137,14 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v15, v21, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 ; SI-NEXT: v_or_b32_e32 v14, v62, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 ; SI-NEXT: v_or_b32_e32 v12, v34, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 ; SI-NEXT: v_or_b32_e32 v13, v30, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 ; SI-NEXT: v_or_b32_e32 v10, v50, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_or_b32_e32 v11, v48, v8 +; SI-NEXT: v_or_b32_e32 v11, v49, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 ; SI-NEXT: v_or_b32_e32 v9, v40, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 @@ -77152,17 +77162,17 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, v8, v9, 8 -; SI-NEXT: v_alignbit_b32 v57, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v56, v14, v15, 24 ; SI-NEXT: v_alignbit_b32 v58, v14, v15, 16 ; SI-NEXT: v_alignbit_b32 v61, v14, v15, 8 ; SI-NEXT: v_alignbit_b32 v44, v13, v12, 24 ; SI-NEXT: v_alignbit_b32 v47, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v56, v13, v12, 8 +; SI-NEXT: v_alignbit_b32 v57, v13, v12, 8 ; SI-NEXT: v_alignbit_b32 v43, v11, v10, 8 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v32 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v23 -; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v24 +; SI-NEXT: 
v_lshrrev_b32_e32 v35, 8, v19 ; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 ; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v14 ; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v13 @@ -77170,7 +77180,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 ; SI-NEXT: v_bfe_u32 v54, v7, 8, 8 ; SI-NEXT: v_bfe_u32 v51, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v49, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v5, 8, 8 ; SI-NEXT: v_bfe_u32 v38, v20, 8, 8 ; SI-NEXT: v_bfe_u32 v33, v4, 8, 8 ; SI-NEXT: v_bfe_u32 v29, v3, 8, 8 @@ -77181,14 +77191,13 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: .LBB105_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 -; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -77200,16 +77209,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v9, v9, v8 -; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -77221,7 +77230,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2 @@ -77255,8 +77263,9 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -77278,24 +77287,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v21, v18 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: 
v_alignbit_b32 v57, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v56, v14, v15, 24 ; SI-NEXT: v_alignbit_b32 v58, v14, v15, 16 ; SI-NEXT: v_alignbit_b32 v61, v14, v15, 8 ; SI-NEXT: v_alignbit_b32 v44, v13, v12, 24 ; SI-NEXT: v_alignbit_b32 v47, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v56, v13, v12, 8 +; SI-NEXT: v_alignbit_b32 v57, v13, v12, 8 ; SI-NEXT: v_alignbit_b32 v43, v11, v10, 8 ; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 ; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v14 @@ -77304,52 +77311,54 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 ; SI-NEXT: v_bfe_u32 v54, v7, 8, 8 ; SI-NEXT: v_bfe_u32 v51, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v49, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v5, 8, 8 ; SI-NEXT: v_bfe_u32 v38, v20, 8, 8 ; SI-NEXT: v_bfe_u32 v33, v4, 8, 8 ; SI-NEXT: v_bfe_u32 v29, v3, 8, 8 ; SI-NEXT: v_bfe_u32 v60, v1, 8, 8 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v19, v19, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v23 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v24, v22, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v22, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 
v22, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v24, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v24 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v37, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_or_b32_e32 v32, v25, v21 +; SI-NEXT: v_or_b32_e32 v32, v23, v21 ; SI-NEXT: v_alignbit_b32 v21, v32, v37, 24 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -77359,13 +77368,13 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v21, v32, v37, 8 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v23, v24, 24 +; SI-NEXT: v_alignbit_b32 v21, v24, v25, 24 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v23, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v24, v25, 16 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v23, v24, 8 +; SI-NEXT: v_alignbit_b32 v21, v24, v25, 8 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v21, v19, v18, 24 @@ -77403,7 +77412,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_bfe_u32 v22, v2, 8, 8 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: .LBB105_3: ; %end -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v37 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 @@ -77422,48 +77431,48 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: v_or_b32_e32 v21, v21, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; 
SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v26, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v54 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v7, v25, v7 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 ; SI-NEXT: v_or_b32_e32 v7, v21, v7 ; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; SI-NEXT: v_or_b32_e32 v7, v7, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v24, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: v_or_b32_e32 v7, v7, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v39 ; SI-NEXT: v_or_b32_e32 v7, v7, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 
24, v51 @@ -77492,9 +77501,9 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v35 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v48 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -77536,7 +77545,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v57 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v56 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 @@ -77554,7 +77563,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v57 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -77655,14 +77664,14 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: 
; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -77670,12 +77679,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr13 @@ -78313,10 +78322,10 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: s_cbranch_execnz .LBB105_4 ; GFX9-NEXT: .LBB105_2: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v1, 0x200 -; GFX9-NEXT: v_pk_add_f16 v20, s17, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, s16, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, s19, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, s18, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, s17, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, s16, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, s19, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s18, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v12, s21, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, s20, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, s23, v1 op_sel_hi:[1,0] @@ -78329,22 +78338,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_pk_add_f16 v3, s28, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v2, s5, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, s4, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 @@ -78373,16 +78382,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 
v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v18 ; GFX9-NEXT: s_branch .LBB105_5 ; GFX9-NEXT: .LBB105_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 @@ -78435,34 +78444,34 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB105_2 ; GFX9-NEXT: .LBB105_4: -; GFX9-NEXT: v_mov_b32_e32 v21, s44 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s44 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s42 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 ; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 ; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 ; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: 
v_mov_b32_e32 v8, s25 ; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 ; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v16, s55 +; GFX9-NEXT: v_mov_b32_e32 v61, s53 +; GFX9-NEXT: v_mov_b32_e32 v15, s54 +; GFX9-NEXT: v_mov_b32_e32 v59, s52 +; GFX9-NEXT: v_mov_b32_e32 v60, s51 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s19 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v17, s55 -; GFX9-NEXT: v_mov_b32_e32 v62, s53 -; GFX9-NEXT: v_mov_b32_e32 v13, s54 -; GFX9-NEXT: v_mov_b32_e32 v60, s52 -; GFX9-NEXT: v_mov_b32_e32 v61, s51 ; GFX9-NEXT: v_mov_b32_e32 v58, s50 -; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v62, s48 ; GFX9-NEXT: v_mov_b32_e32 v57, s49 ; GFX9-NEXT: v_mov_b32_e32 v47, s39 ; GFX9-NEXT: v_mov_b32_e32 v56, s38 @@ -78494,45 +78503,45 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v29, s60 ; GFX9-NEXT: v_mov_b32_e32 v28, s58 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 -; GFX9-NEXT: v_mov_b32_e32 v14, s57 -; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s56 +; GFX9-NEXT: v_mov_b32_e32 v22, s12 +; GFX9-NEXT: v_mov_b32_e32 v23, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s8 +; GFX9-NEXT: v_mov_b32_e32 v25, s6 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte 
Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v21, s14 ; GFX9-NEXT: .LBB105_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 -; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v18, v61, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, 
v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v59, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v15, v62, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 ; 
GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v23 ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 @@ -78544,7 +78553,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 @@ -78556,7 +78565,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 @@ -78568,7 +78577,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v7, v0, 
s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 @@ -78620,8 +78629,8 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload @@ -81446,7 +81455,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_readfirstlane_b32 s47, v22 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v32, s30, 0 ; SI-NEXT: v_writelane_b32 v32, s31, 1 @@ -81456,42 +81465,42 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: 
v_writelane_b32 v32, s37, 5 ; SI-NEXT: v_writelane_b32 v32, s38, 6 ; SI-NEXT: v_writelane_b32 v32, s39, 7 -; SI-NEXT: v_readfirstlane_b32 s74, v30 +; SI-NEXT: v_readfirstlane_b32 s73, v30 ; SI-NEXT: v_readfirstlane_b32 s61, v29 -; SI-NEXT: v_readfirstlane_b32 s63, v28 -; SI-NEXT: v_readfirstlane_b32 s59, v27 +; SI-NEXT: v_readfirstlane_b32 s62, v28 +; SI-NEXT: v_readfirstlane_b32 s58, v27 ; SI-NEXT: v_readfirstlane_b32 s60, v26 -; SI-NEXT: v_readfirstlane_b32 s57, v25 -; SI-NEXT: v_readfirstlane_b32 s58, v24 -; SI-NEXT: v_readfirstlane_b32 s47, v23 -; SI-NEXT: v_readfirstlane_b32 s56, v22 -; SI-NEXT: v_readfirstlane_b32 s44, v21 -; SI-NEXT: v_readfirstlane_b32 s34, v19 -; SI-NEXT: v_readfirstlane_b32 s37, v18 -; SI-NEXT: v_readfirstlane_b32 s94, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v16 -; SI-NEXT: v_readfirstlane_b32 s90, v15 -; SI-NEXT: v_readfirstlane_b32 s93, v14 -; SI-NEXT: v_readfirstlane_b32 s79, v13 -; SI-NEXT: v_readfirstlane_b32 s39, v12 -; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_readfirstlane_b32 s56, v25 +; SI-NEXT: v_readfirstlane_b32 s57, v24 +; SI-NEXT: v_readfirstlane_b32 s46, v23 +; SI-NEXT: v_readfirstlane_b32 s37, v21 +; SI-NEXT: v_readfirstlane_b32 s39, v20 +; SI-NEXT: v_readfirstlane_b32 s31, v19 +; SI-NEXT: v_readfirstlane_b32 s36, v18 +; SI-NEXT: v_readfirstlane_b32 s93, v17 +; SI-NEXT: v_readfirstlane_b32 s30, v16 +; SI-NEXT: v_readfirstlane_b32 s89, v15 +; SI-NEXT: v_readfirstlane_b32 s92, v14 +; SI-NEXT: v_readfirstlane_b32 s77, v13 +; SI-NEXT: v_readfirstlane_b32 s88, v12 +; SI-NEXT: v_readfirstlane_b32 s35, v11 ; SI-NEXT: v_readfirstlane_b32 s38, v10 -; SI-NEXT: v_readfirstlane_b32 s30, v9 -; SI-NEXT: v_readfirstlane_b32 s35, v8 -; SI-NEXT: v_readfirstlane_b32 s92, v7 -; SI-NEXT: v_readfirstlane_b32 s95, v6 -; SI-NEXT: v_readfirstlane_b32 s89, v5 -; SI-NEXT: v_readfirstlane_b32 s91, v4 -; SI-NEXT: v_readfirstlane_b32 s78, v3 -; SI-NEXT: v_readfirstlane_b32 s88, v2 -; SI-NEXT: v_readfirstlane_b32 s76, v1 -; SI-NEXT: 
v_readfirstlane_b32 s77, v0 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 +; SI-NEXT: v_readfirstlane_b32 s95, v9 +; SI-NEXT: v_readfirstlane_b32 s34, v8 +; SI-NEXT: v_readfirstlane_b32 s91, v7 +; SI-NEXT: v_readfirstlane_b32 s94, v6 +; SI-NEXT: v_readfirstlane_b32 s79, v5 +; SI-NEXT: v_readfirstlane_b32 s90, v4 +; SI-NEXT: v_readfirstlane_b32 s76, v3 +; SI-NEXT: v_readfirstlane_b32 s78, v2 +; SI-NEXT: v_readfirstlane_b32 s74, v1 +; SI-NEXT: v_readfirstlane_b32 s75, v0 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s9, v31 @@ -81512,40 +81521,40 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_readfirstlane_b32 s10, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s13, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s13, v31 +; SI-NEXT: v_readfirstlane_b32 s14, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s41, v31 +; SI-NEXT: v_readfirstlane_b32 s40, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s14, v31 +; SI-NEXT: v_readfirstlane_b32 s15, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s43, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s40, 
v31 +; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s45, v31 +; SI-NEXT: v_readfirstlane_b32 s44, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s42, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_readfirstlane_b32 s72, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: v_readfirstlane_b32 s63, v31 ; SI-NEXT: s_cbranch_scc0 .LBB107_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -81576,91 +81585,91 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff -; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_and_b32 s4, s78, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s91, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_and_b32 s4, s90, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s95, 0xff -; 
SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s91, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_and_b32 s4, s38, 0xff -; SI-NEXT: s_lshl_b32 s5, s36, 8 +; SI-NEXT: s_lshl_b32 s5, s35, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s39, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s77, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s93, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s31, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_and_b32 s4, s30, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s37, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s31, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s5, s37, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s58, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 
v19, s4 ; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_and_b32 s4, s62, 0xff ; SI-NEXT: s_lshl_b32 s5, s61, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s63, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s75, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_and_b32 s4, s72, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_and_b32 s4, s45, 0xff ; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: s_lshl_b32 s5, s15, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_and_b32 s4, s13, 0xff ; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 @@ -81689,91 +81698,91 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s7, s8, 8 -; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s6, 
s7, s6 -; SI-NEXT: s_and_b32 s7, s15, 0xff +; SI-NEXT: s_and_b32 s7, s13, 0xff ; SI-NEXT: s_lshl_b32 s8, s10, 8 -; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s41, 0xff -; SI-NEXT: s_lshl_b32 s9, s13, 8 +; SI-NEXT: s_and_b32 s8, s40, 0xff +; SI-NEXT: s_lshl_b32 s9, s14, 8 ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_and_b32 s9, s43, 0xff -; SI-NEXT: s_lshl_b32 s10, s14, 8 -; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_lshl_b32 s10, s15, 8 +; SI-NEXT: s_add_i32 s44, s44, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s45, 0xff -; SI-NEXT: s_lshl_b32 s11, s40, 8 -; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_and_b32 s10, s44, 0xff +; SI-NEXT: s_lshl_b32 s11, s41, 8 +; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s11, s73, 0xff +; SI-NEXT: s_and_b32 s11, s45, 0xff ; SI-NEXT: s_lshl_b32 s12, s42, 8 -; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s72, s72, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_and_b32 s12, s75, 0xff -; SI-NEXT: s_lshl_b32 s13, s62, 8 -; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_and_b32 s12, s72, 0xff +; SI-NEXT: s_lshl_b32 s13, s59, 8 +; SI-NEXT: s_add_i32 s73, s73, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: s_and_b32 s13, s74, 0xff -; SI-NEXT: s_lshl_b32 s14, s72, 8 -; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_and_b32 s13, s73, 0xff +; SI-NEXT: s_lshl_b32 s14, s63, 8 +; SI-NEXT: s_add_i32 s62, s62, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_and_b32 s14, s63, 0xff +; SI-NEXT: s_and_b32 s14, s62, 0xff ; SI-NEXT: s_lshl_b32 s15, s61, 8 ; SI-NEXT: s_add_i32 s60, s60, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s60, 0xff -; SI-NEXT: s_lshl_b32 s40, s59, 8 -; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_lshl_b32 s40, s58, 8 +; SI-NEXT: s_add_i32 s57, s57, 3 ; SI-NEXT: s_or_b32 s15, s40, s15 -; SI-NEXT: s_and_b32 s40, s58, 0xff -; SI-NEXT: 
s_lshl_b32 s41, s57, 8 -; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_and_b32 s40, s57, 0xff +; SI-NEXT: s_lshl_b32 s41, s56, 8 +; SI-NEXT: s_add_i32 s47, s47, 3 ; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: s_and_b32 s41, s56, 0xff -; SI-NEXT: s_lshl_b32 s42, s47, 8 -; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_and_b32 s41, s47, 0xff +; SI-NEXT: s_lshl_b32 s42, s46, 8 +; SI-NEXT: s_add_i32 s39, s39, 3 ; SI-NEXT: s_or_b32 s41, s42, s41 -; SI-NEXT: s_and_b32 s42, s46, 0xff -; SI-NEXT: s_lshl_b32 s43, s44, 8 -; SI-NEXT: s_add_i32 s37, s37, 3 +; SI-NEXT: s_and_b32 s42, s39, 0xff +; SI-NEXT: s_lshl_b32 s43, s37, 8 +; SI-NEXT: s_add_i32 s36, s36, 3 ; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: s_and_b32 s43, s37, 0xff -; SI-NEXT: s_lshl_b32 s44, s34, 8 -; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_and_b32 s43, s36, 0xff +; SI-NEXT: s_lshl_b32 s44, s31, 8 +; SI-NEXT: s_add_i32 s30, s30, 3 ; SI-NEXT: s_or_b32 s43, s44, s43 -; SI-NEXT: s_and_b32 s44, s31, 0xff -; SI-NEXT: s_lshl_b32 s45, s94, 8 -; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_and_b32 s44, s30, 0xff +; SI-NEXT: s_lshl_b32 s45, s93, 8 +; SI-NEXT: s_add_i32 s92, s92, 3 ; SI-NEXT: s_or_b32 s44, s45, s44 -; SI-NEXT: s_and_b32 s45, s93, 0xff -; SI-NEXT: s_lshl_b32 s46, s90, 8 -; SI-NEXT: s_add_i32 s39, s39, 3 +; SI-NEXT: s_and_b32 s45, s92, 0xff +; SI-NEXT: s_lshl_b32 s46, s89, 8 +; SI-NEXT: s_add_i32 s88, s88, 3 ; SI-NEXT: s_or_b32 s45, s46, s45 -; SI-NEXT: s_and_b32 s46, s39, 0xff -; SI-NEXT: s_lshl_b32 s47, s79, 8 +; SI-NEXT: s_and_b32 s46, s88, 0xff +; SI-NEXT: s_lshl_b32 s47, s77, 8 ; SI-NEXT: s_add_i32 s38, s38, 3 ; SI-NEXT: s_or_b32 s46, s47, s46 ; SI-NEXT: s_and_b32 s47, s38, 0xff -; SI-NEXT: s_lshl_b32 s56, s36, 8 -; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_lshl_b32 s56, s35, 8 +; SI-NEXT: s_add_i32 s34, s34, 3 ; SI-NEXT: s_or_b32 s47, s56, s47 -; SI-NEXT: s_and_b32 s56, s35, 0xff -; SI-NEXT: s_lshl_b32 s57, s30, 8 -; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_and_b32 s56, s34, 0xff +; 
SI-NEXT: s_lshl_b32 s57, s95, 8 +; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_or_b32 s56, s57, s56 -; SI-NEXT: s_and_b32 s57, s95, 0xff -; SI-NEXT: s_lshl_b32 s58, s92, 8 -; SI-NEXT: s_add_i32 s91, s91, 3 +; SI-NEXT: s_and_b32 s57, s94, 0xff +; SI-NEXT: s_lshl_b32 s58, s91, 8 +; SI-NEXT: s_add_i32 s90, s90, 3 ; SI-NEXT: s_or_b32 s57, s58, s57 -; SI-NEXT: s_and_b32 s58, s91, 0xff -; SI-NEXT: s_lshl_b32 s59, s89, 8 -; SI-NEXT: s_add_i32 s88, s88, 3 +; SI-NEXT: s_and_b32 s58, s90, 0xff +; SI-NEXT: s_lshl_b32 s59, s79, 8 +; SI-NEXT: s_add_i32 s78, s78, 3 ; SI-NEXT: s_or_b32 s58, s59, s58 -; SI-NEXT: s_and_b32 s59, s88, 0xff -; SI-NEXT: s_lshl_b32 s60, s78, 8 -; SI-NEXT: s_add_i32 s77, s77, 3 +; SI-NEXT: s_and_b32 s59, s78, 0xff +; SI-NEXT: s_lshl_b32 s60, s76, 8 +; SI-NEXT: s_add_i32 s75, s75, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 @@ -81782,8 +81791,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s59, s60, s59 -; SI-NEXT: s_and_b32 s60, s77, 0xff -; SI-NEXT: s_lshl_b32 s61, s76, 8 +; SI-NEXT: s_and_b32 s60, s75, 0xff +; SI-NEXT: s_lshl_b32 s61, s74, 8 ; SI-NEXT: s_and_b32 s28, s28, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 8 ; SI-NEXT: s_and_b32 s26, s26, 0xff @@ -86550,63 +86559,63 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4 -; SI-NEXT: 
v_mul_f32_e32 v28, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 ; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v22, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v41, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_alignbit_b32 v27, v1, v3, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v21, 16 ; SI-NEXT: v_alignbit_b32 v30, v24, v2, 16 ; SI-NEXT: v_alignbit_b32 v1, v30, v27, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill @@ -86617,36 +86626,36 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v1, v30, v27, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_alignbit_b32 v21, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v19, v17, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, v1, v5, 16 +; SI-NEXT: v_alignbit_b32 v19, v17, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v20, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, 
v20, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 8 +; SI-NEXT: v_alignbit_b32 v1, v19, v20, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: v_alignbit_b32 v15, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v16, v13, v7, 16 -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, v1, v8, 16 +; SI-NEXT: v_alignbit_b32 v16, v13, v6, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v14, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v14, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v1, v16, v14, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_alignbit_b32 v10, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v11, v9, v20, 16 +; SI-NEXT: v_alignbit_b32 v10, v1, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v23, 16 ; SI-NEXT: v_alignbit_b32 v1, v11, v10, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -86656,13 +86665,13 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v1, v11, v10, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: 
v_alignbit_b32 v6, v1, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_alignbit_b32 v6, v1, v29, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16 +; SI-NEXT: v_alignbit_b32 v3, v1, v35, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 -; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_alignbit_b32 v2, v1, v37, 16 ; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16 ; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 @@ -86673,24 +86682,24 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 ; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16 -; SI-NEXT: v_mov_b32_e32 v31, v23 -; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16 -; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16 +; SI-NEXT: v_alignbit_b32 v21, v18, v25, 16 +; SI-NEXT: v_alignbit_b32 v15, v12, v34, 16 ; SI-NEXT: v_alignbit_b32 v23, v5, v1, 24 +; SI-NEXT: v_mov_b32_e32 v31, v25 ; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_alignbit_b32 v36, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24 +; SI-NEXT: v_alignbit_b32 v36, v21, v6, 24 +; SI-NEXT: v_alignbit_b32 v25, v15, v3, 24 ; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16 ; SI-NEXT: v_mov_b32_e32 v53, v32 ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16 ; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 -; SI-NEXT: v_mov_b32_e32 v35, v29 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 +; SI-NEXT: v_alignbit_b32 v55, v21, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v21, v6, 8 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_alignbit_b32 v52, v15, v3, 16 +; SI-NEXT: v_alignbit_b32 v54, v15, 
v3, 8 ; SI-NEXT: v_mov_b32_e32 v37, v33 ; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill @@ -86702,27 +86711,27 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 ; SI-NEXT: v_mov_b32_e32 v28, v26 ; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v16 ; SI-NEXT: v_mov_b32_e32 v26, v42 ; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42 ; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 ; SI-NEXT: v_mov_b32_e32 v29, v43 ; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v21 ; SI-NEXT: v_mov_b32_e32 v34, v44 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; SI-NEXT: v_mov_b32_e32 v33, v56 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v44 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v15 +; SI-NEXT: v_mov_b32_e32 v33, v45 +; SI-NEXT: v_lshrrev_b32_e32 v44, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 ; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v48 ; SI-NEXT: v_mov_b32_e32 v48, v32 ; SI-NEXT: v_mov_b32_e32 v32, v50 ; SI-NEXT: v_mov_b32_e32 v50, v25 ; SI-NEXT: v_mov_b32_e32 v25, v36 ; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 @@ -86749,16 +86758,14 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45 ; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, 
v42 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -86786,19 +86793,19 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16 +; 
SI-NEXT: v_alignbit_b32 v20, v19, v17, 16 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -86806,12 +86813,12 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v12, v6, 16 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: v_alignbit_b32 v50, v15, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v15, v3, 16 +; SI-NEXT: v_alignbit_b32 v54, v15, v3, 8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v15 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 @@ -86824,34 +86831,34 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v57, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 ; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 ; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 ; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; 
SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 +; SI-NEXT: v_alignbit_b32 v21, v18, v9, 16 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 +; SI-NEXT: v_alignbit_b32 v25, v21, v6, 24 +; SI-NEXT: v_alignbit_b32 v55, v21, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v21, v6, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -86875,22 +86882,22 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v22, v30, v27, 8 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 24 +; SI-NEXT: v_alignbit_b32 v22, v19, v20, 24 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v22, v19, v20, 16 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 8 +; SI-NEXT: v_alignbit_b32 v22, v19, v20, 8 ; 
SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 24 +; SI-NEXT: v_alignbit_b32 v22, v16, v14, 24 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v22, v16, v14, 16 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v22, v16, v14, 8 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -86903,7 +86910,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 ; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 ; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v44 +; SI-NEXT: v_lshrrev_b32_e32 v44, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v21 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 @@ -86923,8 +86933,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59 ; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 ; SI-NEXT: .LBB109_3: ; %end ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 @@ -86933,11 +86942,11 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: 
v_lshlrev_b32_e32 v22, 24, v22 ; SI-NEXT: v_or_b32_e32 v22, v22, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 @@ -86973,24 +86982,24 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v46 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v46 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v41 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: v_or_b32_e32 v17, v20, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v17, 
v19, s[0:3], 0 offen @@ -86999,42 +87008,42 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v56 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v61 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen @@ -87060,8 +87069,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v47 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v42 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -87083,12 +87092,12 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v60 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 @@ -87109,11 +87118,11 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v57 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v44 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -87136,11 +87145,11 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v45 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v43 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -87174,20 +87183,20 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_mov_b32_e32 v49, v48 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v37, v33 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v33, v56 +; SI-NEXT: v_mov_b32_e32 v33, v45 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: v_mov_b32_e32 v35, v34 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: v_mov_b32_e32 v34, v44 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v31, v23 +; SI-NEXT: v_mov_b32_e32 v31, v25 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: v_mov_b32_e32 v29, v43 
@@ -87210,14 +87219,14 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -87229,33 +87238,33 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: 
$vgpr45 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: @@ -87689,10 +87698,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 @@ -87753,28 +87762,28 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v19, s42 ; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: v_mov_b32_e32 v2, s17 ; VI-NEXT: v_mov_b32_e32 v3, s18 -; VI-NEXT: v_mov_b32_e32 v4, s19 ; VI-NEXT: v_mov_b32_e32 v5, s20 -; VI-NEXT: v_mov_b32_e32 v6, s21 ; VI-NEXT: v_mov_b32_e32 v7, s22 -; VI-NEXT: v_mov_b32_e32 v8, s23 ; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 ; VI-NEXT: v_mov_b32_e32 v11, s26 -; VI-NEXT: v_mov_b32_e32 v12, s27 ; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 ; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: v_mov_b32_e32 v16, s5 ; VI-NEXT: v_mov_b32_e32 v18, s67 ; VI-NEXT: v_mov_b32_e32 v62, s65 ; VI-NEXT: v_mov_b32_e32 v17, s66 -; VI-NEXT: v_mov_b32_e32 v60, s64 -; VI-NEXT: v_mov_b32_e32 v61, s55 -; VI-NEXT: v_mov_b32_e32 v58, s54 -; VI-NEXT: v_mov_b32_e32 v59, s52 +; VI-NEXT: v_mov_b32_e32 v58, s64 +; VI-NEXT: v_mov_b32_e32 v60, s55 +; VI-NEXT: 
v_mov_b32_e32 v2, s17 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v16, s5 +; VI-NEXT: v_mov_b32_e32 v59, s54 +; VI-NEXT: v_mov_b32_e32 v61, s52 ; VI-NEXT: v_mov_b32_e32 v57, s53 ; VI-NEXT: v_mov_b32_e32 v47, s51 ; VI-NEXT: v_mov_b32_e32 v56, s50 @@ -87825,14 +87834,14 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 
e66762f1e02c2..953dcc8cfee59 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -2177,46 +2177,46 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: 
v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 @@ -2239,129 +2239,129 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 
16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -2372,6 +2372,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -2389,29 +2390,28 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 
16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_4 @@ -2434,61 +2434,61 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 
16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_or_b32_sdwa v11, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2496,6 +2496,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -2513,29 +2514,28 @@ define <36 x i16> 
@bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB12_2: ; %Flow ; 
GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_4 @@ -2558,45 +2558,45 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; 
GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v34, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v32, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v30, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v28, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v27, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v26, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v25, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v23, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v22, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v21, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v19, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v36, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v35, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v34, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v33, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v32, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v31, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v25, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v23, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v22, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v21, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v20, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v19, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36i16: @@ -3658,8 +3658,10 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v36i16_to_v18i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v18 ; VI-NEXT: v_mov_b32_e32 v32, v17 ; VI-NEXT: v_mov_b32_e32 v33, v16 ; VI-NEXT: v_mov_b32_e32 v34, v15 @@ -3678,7 +3680,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v2 ; VI-NEXT: v_mov_b32_e32 v40, v1 ; VI-NEXT: v_mov_b32_e32 v41, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3800,8 +3802,9 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4364,23 +4367,23 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -4427,35 +4430,35 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s8, s8, s18 @@ -4470,7 +4473,7 @@ define inreg <18 x i32> 
@bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -4485,7 +4488,6 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s16 @@ -4500,6 +4502,7 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v11, s8 ; VI-NEXT: v_mov_b32_e32 v12, s7 ; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v17 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -4551,11 +4554,11 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -4574,15 +4577,14 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg 
%a, i3 ; GFX9-NEXT: s_cbranch_execnz .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -4597,6 +4599,7 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: .LBB15_3: ; %end @@ -5053,6 +5056,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -5070,29 +5074,28 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] ; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB16_4 @@ -5115,61 +5118,61 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: 
v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 
v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v22 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_or_b32_sdwa v11, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5177,6 +5180,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -5194,29 +5198,28 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB16_4 @@ -5239,45 +5242,45 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 
v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v34, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v32, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v30, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v28, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v27, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v26, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v25, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v23, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v22, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v21, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v19, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v36, v0, s4 +; GFX9-NEXT: 
v_perm_b32 v1, v35, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v34, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v33, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v32, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v31, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v25, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v23, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v22, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v21, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v20, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v19, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36f16: @@ -5578,27 +5581,26 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 ; SI-NEXT: .LBB17_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 ; SI-NEXT: v_or_b32_e32 v33, v33, v34 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 
offen +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 @@ -6521,8 +6523,10 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v36f16_to_v18i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v18 ; VI-NEXT: v_mov_b32_e32 v32, v17 ; VI-NEXT: v_mov_b32_e32 v33, v16 ; VI-NEXT: v_mov_b32_e32 v34, v15 @@ -6541,7 +6545,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v2 ; VI-NEXT: v_mov_b32_e32 v40, v1 ; VI-NEXT: v_mov_b32_e32 v41, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6663,8 +6667,9 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7363,23 +7368,23 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -7400,13 +7405,13 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -7519,11 +7524,11 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -9181,46 +9186,46 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 
+; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 @@ -9243,129 +9248,129 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 +; SI-NEXT: 
v_alignbit_b32 v22, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 
16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 
16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -9376,6 +9381,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -9393,29 +9399,28 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; 
VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB28_4 @@ -9438,61 +9443,61 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB28_4: ; %end ; 
VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; 
VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_or_b32_sdwa v11, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9500,6 +9505,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -9517,29 +9523,28 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; 
GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB28_4 @@ -9562,45 +9567,45 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, 
v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v34, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v32, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v30, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 
-; GFX9-NEXT: v_perm_b32 v7, v28, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v27, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v26, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v25, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v23, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v22, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v21, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v19, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v36, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v35, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v34, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v33, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v32, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v31, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v25, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v23, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v22, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v21, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v20, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v19, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36i16: @@ -10655,8 +10660,10 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v36i16_to_v18f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v18 ; VI-NEXT: v_mov_b32_e32 v32, v17 ; VI-NEXT: 
v_mov_b32_e32 v33, v16 ; VI-NEXT: v_mov_b32_e32 v34, v15 @@ -10675,7 +10682,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v2 ; VI-NEXT: v_mov_b32_e32 v40, v1 ; VI-NEXT: v_mov_b32_e32 v41, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10797,8 +10804,9 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11361,23 +11369,23 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -11424,35 +11432,35 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, 
s25, 3 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s8, s8, s18 @@ -11467,7 +11475,7 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -11482,7 +11490,6 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s16 @@ -11497,6 +11504,7 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s8 ; VI-NEXT: v_mov_b32_e32 v12, s7 ; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v17 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: @@ -11548,11 +11556,11 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -11571,15 +11579,14 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; GFX9-NEXT: s_cbranch_execnz .LBB31_3 ; GFX9-NEXT: .LBB31_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -11594,6 +11601,7 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: .LBB31_3: ; %end @@ -12050,6 +12058,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -12067,29 +12076,28 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: 
v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB32_4 @@ -12112,61 +12120,61 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 
v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB32_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; VI-NEXT: .LBB32_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v15, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_or_b32_sdwa v11, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -12174,6 +12182,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -12191,29 +12200,28 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB32_4 @@ -12236,45 +12244,45 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v34, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v32, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v30, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v28, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v27, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v26, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v25, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v23, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v22, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v21, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v19, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v36, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v35, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v34, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v33, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v32, v4, s4 +; GFX9-NEXT: 
v_perm_b32 v5, v31, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v25, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v23, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v22, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v21, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v20, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v19, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36f16: @@ -12557,27 +12565,26 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 ; SI-NEXT: v_or_b32_e32 v33, v33, v34 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_add_i32_e32 v33, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 @@ -13520,8 +13527,10 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v36f16_to_v18f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v18 ; VI-NEXT: v_mov_b32_e32 v32, v17 ; VI-NEXT: v_mov_b32_e32 v33, v16 ; VI-NEXT: v_mov_b32_e32 v34, v15 @@ -13540,7 +13549,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v2 ; VI-NEXT: v_mov_b32_e32 v40, v1 ; VI-NEXT: v_mov_b32_e32 v41, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13662,8 +13671,9 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14362,23 +14372,23 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -14399,13 +14409,13 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -14518,11 +14528,11 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -15379,46 +15389,46 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 -; 
SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 +; 
SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 @@ -15441,129 +15451,129 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; 
SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -15574,6 +15584,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: 
$vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -15591,29 +15602,28 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; 
VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB40_4 @@ -15636,61 +15646,61 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: 
v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_or_b32_sdwa v11, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15698,6 +15708,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -15715,29 +15726,28 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB40_4 @@ -15760,45 +15770,45 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, 
v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v34, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v32, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v30, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v28, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v27, v8, s4 -; 
GFX9-NEXT: v_perm_b32 v9, v26, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v25, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v23, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v22, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v21, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v19, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v36, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v35, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v34, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v33, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v32, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v31, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v25, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v23, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v22, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v21, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v20, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v19, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36i16: @@ -16870,8 +16880,10 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v36i16_to_v9i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v18 ; VI-NEXT: v_mov_b32_e32 v32, v17 ; VI-NEXT: v_mov_b32_e32 v33, v16 ; VI-NEXT: v_mov_b32_e32 v34, v15 @@ -16890,7 +16902,7 @@ define <9 x i64> 
@bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v2 ; VI-NEXT: v_mov_b32_e32 v40, v1 ; VI-NEXT: v_mov_b32_e32 v41, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17012,8 +17024,9 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17576,23 +17589,23 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -17639,35 +17652,35 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s8, s8, s18 @@ -17682,7 +17695,7 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -17697,7 +17710,6 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s16 @@ -17712,6 +17724,7 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v11, s8 ; VI-NEXT: v_mov_b32_e32 v12, s7 ; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v17 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: @@ -17763,11 +17776,11 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -17786,15 +17799,14 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; GFX9-NEXT: s_cbranch_execnz .LBB43_3 ; GFX9-NEXT: .LBB43_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: 
v_and_b32_e32 v17, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -17809,6 +17821,7 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: .LBB43_3: ; %end @@ -18265,6 +18278,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -18282,29 +18296,28 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 
16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB44_4 @@ -18327,61 +18340,61 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, 
v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 
v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_or_b32_sdwa v11, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -18389,6 +18402,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -18406,29 +18420,28 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB44_4 @@ -18451,45 +18464,45 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v34, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v32, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v30, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v28, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v27, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v26, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v25, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v23, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v22, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v21, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v19, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v36, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v35, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v34, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v33, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v32, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v31, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 +; GFX9-NEXT: 
v_perm_b32 v9, v27, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v25, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v23, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v22, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v21, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v20, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v19, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36f16: @@ -18800,27 +18813,26 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 ; SI-NEXT: v_or_b32_e32 v33, v33, v34 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 @@ -19743,8 +19755,10 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; 
VI-LABEL: bitcast_v36f16_to_v9i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v18 ; VI-NEXT: v_mov_b32_e32 v32, v17 ; VI-NEXT: v_mov_b32_e32 v33, v16 ; VI-NEXT: v_mov_b32_e32 v34, v15 @@ -19763,7 +19777,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v2 ; VI-NEXT: v_mov_b32_e32 v40, v1 ; VI-NEXT: v_mov_b32_e32 v41, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19885,8 +19899,9 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -20585,23 +20600,23 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> 
inreg %a, i32 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -20622,13 +20637,13 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; VI-NEXT: .LBB47_2: ; 
%cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -20741,11 +20756,11 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -20905,46 +20920,46 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: 
$vgpr37 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 
+; SI-NEXT: v_alignbit_b32 v32, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 @@ -20958,129 +20973,129 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 16 +; 
SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -21091,6 +21106,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -21108,29 +21124,28 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; 
VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_4 @@ -21144,61 +21159,61 @@ define <36 x i16> 
@bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: 
v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_or_b32_sdwa v12, v12, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -21206,6 +21221,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -21223,29 +21239,28 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 
v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_4 @@ -21259,45 +21274,45 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, 
v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v34, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v32, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v30, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v28, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v27, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v26, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v25, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s4 -; GFX9-NEXT: 
v_perm_b32 v12, v23, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v22, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v21, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v19, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v36, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v35, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v34, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v33, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v32, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v31, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v12, s4 +; GFX9-NEXT: v_perm_b32 v11, v25, v11, s4 +; GFX9-NEXT: v_perm_b32 v13, v23, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v22, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v21, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v20, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v19, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36i16: @@ -22316,8 +22331,10 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v36i16_to_v9f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v18 ; VI-NEXT: v_mov_b32_e32 v32, v17 ; VI-NEXT: v_mov_b32_e32 v33, v16 ; VI-NEXT: v_mov_b32_e32 v34, v15 @@ -22336,7 +22353,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v2 ; VI-NEXT: v_mov_b32_e32 v40, v1 ; VI-NEXT: 
v_mov_b32_e32 v41, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22458,8 +22475,9 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23022,23 +23040,23 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -23085,35 +23103,35 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s8, s8, s18 @@ -23128,7 +23146,7 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 
0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -23143,7 +23161,6 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s16 @@ -23158,6 +23175,7 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v11, s8 ; VI-NEXT: v_mov_b32_e32 v12, s7 ; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v17 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -23209,11 +23227,11 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -23232,15 +23250,14 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: 
v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -23255,6 +23272,7 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: .LBB51_3: ; %end @@ -23693,6 +23711,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -23710,29 +23729,28 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 
16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB52_4 @@ -23746,61 +23764,61 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; 
VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 
+; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_or_b32_sdwa v12, v12, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23808,6 +23826,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -23825,29 +23844,28 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 
16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB52_4 @@ -23861,45 +23879,45 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v34, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v32, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v30, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v28, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v27, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v26, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v25, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v23, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v22, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v21, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v19, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v36, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v35, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v34, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v33, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v32, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v31, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v12, s4 +; GFX9-NEXT: v_perm_b32 v11, v25, v11, s4 +; GFX9-NEXT: v_perm_b32 
v13, v23, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v22, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v21, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v20, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v19, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36f16: @@ -24055,15 +24073,15 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI-NEXT: s_lshr_b32 s8, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 ; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 ; SI-NEXT: s_lshr_b32 s8, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 ; SI-NEXT: s_lshr_b32 s8, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: s_lshr_b32 s8, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 ; SI-NEXT: s_lshr_b32 s8, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: s_lshr_b32 s8, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 ; SI-NEXT: s_lshr_b32 s8, s26, 16 @@ -24073,13 +24091,13 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI-NEXT: s_lshr_b32 s8, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 ; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 ; SI-NEXT: s_lshr_b32 s8, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 ; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 ; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 ; SI-NEXT: s_lshr_b32 s8, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 ; SI-NEXT: s_lshr_b32 s8, s18, 16 @@ -24087,137 +24105,136 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI-NEXT: s_lshr_b32 s8, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 ; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: 
v_cvt_f32_f16_e32 v36, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[25:26], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[21:22], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[13:14], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; 
SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 +; SI-NEXT: v_add_f64 v[36:37], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[26:27], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; SI-NEXT: 
v_lshrrev_b32_e32 v35, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: 
v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 ; SI-NEXT: v_or_b32_e32 v33, v33, v34 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: 
v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -24225,96 +24242,95 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) 
; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, 
s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; 
implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -24324,16 +24340,16 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB53_2 ; @@ -25109,8 +25125,10 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v36f16_to_v9f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v18 ; VI-NEXT: v_mov_b32_e32 v32, v17 ; VI-NEXT: v_mov_b32_e32 v33, v16 ; VI-NEXT: v_mov_b32_e32 v34, v15 @@ -25129,7 +25147,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v2 ; VI-NEXT: v_mov_b32_e32 v40, v1 ; VI-NEXT: v_mov_b32_e32 v41, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25251,8 +25269,9 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -25951,23 +25970,23 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; 
VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -25988,13 +26007,13 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -26107,11 +26126,11 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 
x half> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -26710,27 +26729,27 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_add_u16_e32 v35, 3, v35 +; VI-NEXT: v_add_u16_e32 v36, 3, v36 ; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: v_add_u16_e32 v34, 3, v34 +; VI-NEXT: v_add_u16_e32 v35, 3, v35 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: v_add_u16_e32 v33, 3, v33 +; VI-NEXT: v_add_u16_e32 v34, 3, v34 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_add_u16_e32 v32, 3, v32 +; VI-NEXT: v_add_u16_e32 v33, 3, v33 ; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_add_u16_e32 v18, 3, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v32 ; 
VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_add_u16_e32 v31, 3, v31 ; VI-NEXT: v_add_u16_e32 v6, 3, v6 @@ -26759,7 +26778,15 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 ; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -26773,8 +26800,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 ; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 ; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 @@ -26784,122 +26809,116 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x 
i16> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 ; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v36i16_to_v36f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v17, v35, v17, s6 -; GFX9-NEXT: v_perm_b32 v16, v34, v16, s6 -; GFX9-NEXT: v_perm_b32 v15, v33, v15, s6 -; GFX9-NEXT: v_perm_b32 v14, v32, v14, s6 -; GFX9-NEXT: v_perm_b32 v13, v31, v13, s6 -; GFX9-NEXT: v_perm_b32 v12, v30, v12, s6 -; GFX9-NEXT: v_perm_b32 v11, v29, v11, s6 -; GFX9-NEXT: v_perm_b32 v10, v28, v10, s6 -; GFX9-NEXT: v_perm_b32 v9, v27, v9, s6 -; GFX9-NEXT: v_perm_b32 v8, v26, v8, s6 -; GFX9-NEXT: v_perm_b32 v7, v25, v7, s6 -; GFX9-NEXT: v_perm_b32 v6, v24, v6, s6 -; GFX9-NEXT: v_perm_b32 v5, v22, v5, s6 -; GFX9-NEXT: v_perm_b32 v4, v23, v4, s6 -; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 -; GFX9-NEXT: v_perm_b32 v2, v20, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v19, v1, s6 -; GFX9-NEXT: v_perm_b32 v0, v18, v0, 
s6 +; GFX9-NEXT: v_perm_b32 v17, v36, v17, s6 +; GFX9-NEXT: v_perm_b32 v16, v35, v16, s6 +; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v32, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v33, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v26, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v27, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v28, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v29, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v30, v7, s6 +; GFX9-NEXT: v_perm_b32 v8, v34, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v25, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v24, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v23, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v22, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v21, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v20, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v19, v15, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] 
-; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 ; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v18, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v19, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v20, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v21, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v23, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v22, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v24, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v25, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v26, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v28, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v29, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v30, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v32, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v33, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v34, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v35, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v31, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v32, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v26, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v27, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v28, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v30, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v34, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v25, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v24, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v23, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v22, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v21, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v20, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v19, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v35, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v36, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v36f16: @@ -27090,20 +27109,20 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 ; SI-NEXT: 
v_cvt_f32_f16_e32 v51, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 @@ -27167,20 +27186,20 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 @@ -27239,48 +27258,48 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -27342,20 +27361,20 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -27530,9 +27549,9 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 ; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 -; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 ; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 ; GFX9-NEXT: v_pk_add_u16 v9, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 @@ -27548,23 +27567,23 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 ; GFX9-NEXT: v_pk_add_u16 v19, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_u16 v18, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_pk_add_u16 v18, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_pk_add_u16 v21, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 ; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 -; GFX9-NEXT: v_pk_add_u16 v21, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_pk_add_u16 v20, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 @@ -27575,10 +27594,10 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v27, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 @@ -27589,8 +27608,8 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: .LBB57_4: ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v10, s27 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 @@ -27601,10 +27620,10 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v18, s18 ; GFX9-NEXT: v_mov_b32_e32 v21, s17 ; GFX9-NEXT: v_mov_b32_e32 v20, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s43 -; GFX9-NEXT: v_mov_b32_e32 v23, s42 -; GFX9-NEXT: v_mov_b32_e32 v24, s41 -; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v25, s43 +; GFX9-NEXT: v_mov_b32_e32 v24, s42 +; GFX9-NEXT: v_mov_b32_e32 v23, s41 +; GFX9-NEXT: v_mov_b32_e32 v22, s40 ; GFX9-NEXT: v_mov_b32_e32 v26, s15 ; GFX9-NEXT: v_mov_b32_e32 v27, s14 ; GFX9-NEXT: v_mov_b32_e32 v28, s13 @@ -27623,6 +27642,7 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 
; GFX9-NEXT: v_lshl_or_b32 v20, v35, 16, v20 @@ -27634,9 +27654,9 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v26, 16, v9 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 @@ -27646,11 +27666,10 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v6, v29, 16, v6 ; GFX9-NEXT: v_lshl_or_b32 v7, v28, 16, v7 ; GFX9-NEXT: v_lshl_or_b32 v8, v27, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v9, v26, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v23, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v13, v22, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v10, v22, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v23, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v25, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v20 ; GFX9-NEXT: v_mov_b32_e32 v1, v21 @@ -27841,310 +27860,315 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 ; SI-NEXT: 
v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: s_waitcnt vmcnt(3) 
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v43 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 
-; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v16 +; 
SI-NEXT: v_cvt_f32_f16_e32 v22, v32 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v35 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_or_b32_e32 v31, v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v21 +; SI-NEXT: v_or_b32_e32 v36, v20, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v38 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_or_b32_e32 v39, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v33, v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v50, v20, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_or_b32_e32 v48, v24, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 ; SI-NEXT: v_add_f32_e32 v20, 
0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v36, v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v39, v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v49, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v37, v24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: 
v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v50, v21, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v49, v21, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_or_b32_e32 v38, v24, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; 
SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v35, v25, v23 -; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v8, v3, v21, 16 +; SI-NEXT: v_or_b32_e32 v34, v26, v24 +; SI-NEXT: v_or_b32_e32 v31, v27, v25 ; SI-NEXT: v_or_b32_e32 v15, v15, v19 ; SI-NEXT: v_or_b32_e32 v12, v12, v18 -; SI-NEXT: v_or_b32_e32 v8, v8, v16 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_alignbit_b32 v55, v39, v20, 16 -; SI-NEXT: v_alignbit_b32 v54, v36, v22, 16 -; SI-NEXT: v_alignbit_b32 v53, v33, v21, 16 -; SI-NEXT: v_alignbit_b32 v52, v31, v23, 16 -; SI-NEXT: v_alignbit_b32 v51, v13, v24, 16 -; SI-NEXT: v_alignbit_b32 v19, v10, v19, 16 -; SI-NEXT: v_alignbit_b32 v18, v6, v18, 16 -; SI-NEXT: v_alignbit_b32 v16, v3, v16, 16 -; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_alignbit_b32 v55, v50, v20, 16 +; SI-NEXT: v_alignbit_b32 v54, v39, v23, 16 +; SI-NEXT: v_alignbit_b32 v53, v36, v22, 16 +; SI-NEXT: v_alignbit_b32 v52, v33, v24, 16 +; SI-NEXT: v_alignbit_b32 v51, v17, v25, 16 +; SI-NEXT: v_alignbit_b32 v19, v13, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v10, v18, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v14, 16 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: 
buffer_store_dword v20, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v38 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v35 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_and_b32_e32 v20, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v20, v16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v16, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_or_b32_e32 v16, v16, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v17, v20, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_add_i32_e32 
v11, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: 
buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload @@ -28171,27 +28195,27 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_add_f16_e32 v35, 0x200, v35 +; VI-NEXT: v_add_f16_e32 v36, 0x200, v36 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_add_f16_e32 v34, 0x200, v34 +; VI-NEXT: v_add_f16_e32 v35, 0x200, v35 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_e32 v33, 0x200, v33 
+; VI-NEXT: v_add_f16_e32 v34, 0x200, v34 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v33 ; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 ; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -28220,7 +28244,15 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 ; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 ; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28234,8 +28266,6 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 ; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 ; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 @@ -28245,123 +28275,117 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 ; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_or_b32_sdwa v0, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v36f16_to_v36i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v28, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v17, v35, v17, s6 -; GFX9-NEXT: v_perm_b32 v16, v34, v16, s6 -; GFX9-NEXT: v_perm_b32 v15, v33, v15, s6 -; GFX9-NEXT: v_perm_b32 v14, v32, v14, s6 -; GFX9-NEXT: v_perm_b32 v13, v31, v13, s6 -; GFX9-NEXT: v_perm_b32 v12, v30, v12, s6 -; GFX9-NEXT: v_perm_b32 v11, v29, v11, s6 -; GFX9-NEXT: v_perm_b32 v10, v28, v10, s6 -; GFX9-NEXT: v_perm_b32 v9, v27, v9, s6 -; GFX9-NEXT: v_perm_b32 v8, v26, v8, s6 -; GFX9-NEXT: 
v_perm_b32 v7, v25, v7, s6 +; GFX9-NEXT: v_perm_b32 v17, v36, v17, s6 +; GFX9-NEXT: v_perm_b32 v16, v35, v16, s6 +; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v32, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v33, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v26, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v27, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v28, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v29, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v30, v7, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v6, v24, v6, s6 -; GFX9-NEXT: v_perm_b32 v5, v22, v5, s6 -; GFX9-NEXT: v_perm_b32 v4, v23, v4, s6 -; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 -; GFX9-NEXT: v_perm_b32 v2, v20, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v19, v1, s6 -; GFX9-NEXT: v_perm_b32 v0, v18, v0, s6 +; GFX9-NEXT: v_perm_b32 v8, v34, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v25, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v24, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v23, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v22, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v21, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v20, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v19, v15, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: 
v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 ; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v18, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v19, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v20, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v21, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v23, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v22, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v24, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v25, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v26, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v27, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v28, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v29, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v30, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v32, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v33, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v34, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v35, v17, s4 +; GFX9-NEXT: v_perm_b32 v0, v31, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v32, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v33, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v26, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v27, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v28, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v30, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v34, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v25, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v24, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v23, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v22, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v21, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, 
v20, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v19, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v35, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v36, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v36i16: @@ -28520,301 +28544,307 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-LABEL: bitcast_v36f16_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: 
v_cvt_f16_f32_e32 v10, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: 
v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 
0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 +; SI-NEXT: v_or_b32_e32 v24, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 +; SI-NEXT: v_or_b32_e32 v27, v13, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_or_b32_e32 v32, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 
0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v35, v13, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v6, v6, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v39, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v48, v17, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_or_b32_e32 v31, v13, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v28, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v29 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v23, v23, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, 
v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v37, v17, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 +; SI-NEXT: v_or_b32_e32 v33, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v29, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_or_b32_e32 v24, v24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v27, v27, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v13, v13, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_or_b32_e32 v25, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v22, v22, v36 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v18, v18, v34 -; SI-NEXT: v_or_b32_e32 v15, v15, v33 -; SI-NEXT: v_or_b32_e32 v29, v29, v32 -; SI-NEXT: v_or_b32_e32 v26, v26, v31 -; SI-NEXT: v_or_b32_e32 v11, v11, v30 -; SI-NEXT: v_or_b32_e32 v8, v8, v12 -; SI-NEXT: 
v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_alignbit_b32 v36, v19, v36, 16 -; SI-NEXT: v_alignbit_b32 v35, v16, v35, 16 -; SI-NEXT: v_alignbit_b32 v34, v13, v34, 16 -; SI-NEXT: v_alignbit_b32 v33, v27, v33, 16 -; SI-NEXT: v_alignbit_b32 v32, v24, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v23, v31, 16 -; SI-NEXT: v_alignbit_b32 v30, v6, v30, 16 -; SI-NEXT: v_alignbit_b32 v12, v3, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v8, v3, v14, 16 +; SI-NEXT: v_or_b32_e32 v11, v11, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_alignbit_b32 v52, v31, v13, 16 +; SI-NEXT: v_alignbit_b32 v54, v39, v16, 16 +; SI-NEXT: v_alignbit_b32 v53, v35, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v32, v17, 16 +; SI-NEXT: v_alignbit_b32 v50, v27, v18, 16 +; SI-NEXT: v_alignbit_b32 v49, v24, v19, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v20, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v12, 16 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v22, v22, v36 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: 
buffer_store_dword v13, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 ; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, 
v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -28963,20 +28993,20 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; GFX9-LABEL: bitcast_v36f16_to_v36i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s43, s29, 16 -; GFX9-NEXT: s_lshr_b32 s42, s28, 16 -; GFX9-NEXT: s_lshr_b32 s41, s27, 16 -; GFX9-NEXT: s_lshr_b32 s40, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s15, s24, 16 +; GFX9-NEXT: s_lshr_b32 s14, s23, 16 +; GFX9-NEXT: s_lshr_b32 s13, s22, 16 +; GFX9-NEXT: s_lshr_b32 s12, s21, 16 +; GFX9-NEXT: s_lshr_b32 s11, s20, 16 +; GFX9-NEXT: s_lshr_b32 s10, s19, 16 +; GFX9-NEXT: s_lshr_b32 s9, s18, 16 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 @@ -28987,48 +29017,48 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_cbranch_execnz .LBB59_4 ; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s43 +; 
GFX9-NEXT: v_pk_add_f16 v12, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s42 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s40 +; GFX9-NEXT: v_pk_add_f16 v9, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s15 +; GFX9-NEXT: v_pk_add_f16 v8, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s14 +; GFX9-NEXT: v_pk_add_f16 v7, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s13 +; GFX9-NEXT: v_pk_add_f16 v6, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s12 +; GFX9-NEXT: v_pk_add_f16 v5, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s11 +; GFX9-NEXT: v_pk_add_f16 v4, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s10 +; GFX9-NEXT: v_pk_add_f16 v19, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s9 +; GFX9-NEXT: v_pk_add_f16 v18, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s8 +; GFX9-NEXT: v_pk_add_f16 v21, s4, v13 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s7 +; GFX9-NEXT: v_pk_add_f16 v20, s4, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s6 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 
op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 -; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 -; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 -; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 -; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 -; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 -; GFX9-NEXT: v_pk_add_f16 v9, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 -; GFX9-NEXT: v_pk_add_f16 v8, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 -; GFX9-NEXT: v_pk_add_f16 v7, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 -; GFX9-NEXT: v_pk_add_f16 v6, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 -; GFX9-NEXT: v_pk_add_f16 v5, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 -; GFX9-NEXT: v_pk_add_f16 v4, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 -; GFX9-NEXT: v_pk_add_f16 v19, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 -; GFX9-NEXT: v_pk_add_f16 v18, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 -; GFX9-NEXT: v_pk_add_f16 v21, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 -; GFX9-NEXT: v_pk_add_f16 v20, s4, v14 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 @@ -29039,10 +29069,10 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v24, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 @@ -29053,8 +29083,8 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: .LBB59_4: ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v10, s27 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 @@ -29065,20 +29095,20 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v18, s18 ; GFX9-NEXT: v_mov_b32_e32 v21, s17 ; GFX9-NEXT: v_mov_b32_e32 v20, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s43 +; GFX9-NEXT: v_mov_b32_e32 v25, s6 +; GFX9-NEXT: v_mov_b32_e32 v24, s43 ; GFX9-NEXT: v_mov_b32_e32 v23, s42 -; GFX9-NEXT: v_mov_b32_e32 v24, s41 -; GFX9-NEXT: v_mov_b32_e32 v25, s40 -; GFX9-NEXT: v_mov_b32_e32 v26, s15 -; GFX9-NEXT: v_mov_b32_e32 v27, s14 -; GFX9-NEXT: v_mov_b32_e32 v28, s13 -; GFX9-NEXT: v_mov_b32_e32 v29, s12 -; GFX9-NEXT: v_mov_b32_e32 v30, s11 -; GFX9-NEXT: v_mov_b32_e32 v31, s10 -; GFX9-NEXT: v_mov_b32_e32 v32, s9 -; GFX9-NEXT: v_mov_b32_e32 v33, s8 -; GFX9-NEXT: v_mov_b32_e32 v34, s7 -; GFX9-NEXT: v_mov_b32_e32 v35, s6 +; GFX9-NEXT: v_mov_b32_e32 v22, s41 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s15 +; GFX9-NEXT: v_mov_b32_e32 v28, s14 +; GFX9-NEXT: v_mov_b32_e32 v29, s13 +; GFX9-NEXT: v_mov_b32_e32 v30, s12 +; GFX9-NEXT: v_mov_b32_e32 v31, s11 +; GFX9-NEXT: v_mov_b32_e32 v32, s10 +; GFX9-NEXT: 
v_mov_b32_e32 v33, s9 +; GFX9-NEXT: v_mov_b32_e32 v34, s8 +; GFX9-NEXT: v_mov_b32_e32 v35, s7 ; GFX9-NEXT: .LBB59_5: ; %end ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 @@ -29087,6 +29117,7 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v20, v35, 16, v20 @@ -29098,9 +29129,9 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v26, 16, v9 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 @@ -29110,11 +29141,10 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v6, v29, 16, v6 ; GFX9-NEXT: v_lshl_or_b32 v7, v28, 16, v7 ; GFX9-NEXT: v_lshl_or_b32 v8, v27, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v9, v26, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v23, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v13, v22, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v10, v22, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v23, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v25, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v20 ; GFX9-NEXT: v_mov_b32_e32 v1, v21 diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index b8091d8256457..831fea8e8e49c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -2010,8 +2010,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; SI-LABEL: bitcast_v10f64_to_v20i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v17, v3 @@ -2019,18 +2019,18 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2055,8 +2055,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; VI-LABEL: bitcast_v10f64_to_v20i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v12, v6 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v13 ; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 @@ -2064,18 +2064,18 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2100,8 +2100,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; GFX9-LABEL: bitcast_v10f64_to_v20i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 ; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 @@ -2109,18 +2109,18 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 
v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 @@ -3853,10 +3853,12 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v40i16_to_v20i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v44, v20 ; VI-NEXT: v_mov_b32_e32 v32, v19 ; VI-NEXT: v_mov_b32_e32 v33, v18 ; VI-NEXT: v_mov_b32_e32 v34, v17 @@ -3877,7 +3879,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v41, v2 ; VI-NEXT: v_mov_b32_e32 v42, v1 ; VI-NEXT: v_mov_b32_e32 v43, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4011,10 +4013,11 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, v20, v19 ; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4639,18 +4642,20 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 @@ -4660,12 +4665,10 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -4712,53 +4715,53 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 @@ -4849,15 +4852,14 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -4872,6 +4874,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s18 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 @@ -7152,10 +7155,12 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v40f16_to_v20i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v44, v20 ; VI-NEXT: v_mov_b32_e32 v32, v19 ; VI-NEXT: v_mov_b32_e32 v33, v18 ; VI-NEXT: v_mov_b32_e32 v34, v17 @@ -7176,7 
+7181,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v41, v2 ; VI-NEXT: v_mov_b32_e32 v42, v1 ; VI-NEXT: v_mov_b32_e32 v43, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7310,10 +7315,11 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, v20, v19 ; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8099,18 +8105,20 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 @@ -8120,12 +8128,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -8146,13 +8152,13 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -8210,13 +8216,13 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_add_f16_sdwa v18, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v34 +; VI-NEXT: v_add_f16_sdwa v20, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_or_b32_e32 v18, v21, v20 ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -8235,7 +8241,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v35, v2 ; GFX9-NEXT: v_mov_b32_e32 v36, v1 ; GFX9-NEXT: v_mov_b32_e32 v37, v0 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 ; GFX9-NEXT: s_lshr_b32 s43, s26, 16 @@ -8248,7 +8254,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s9, s19, 16 ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s40, s16, 16 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 @@ -8257,7 +8263,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s16, s40 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 @@ -8270,22 +8276,21 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s29, s6 ; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false 
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s40 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -8298,7 +8303,8 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s18 -; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB19_3 ; GFX9-NEXT: .LBB19_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 @@ -8315,7 +8321,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 -; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s40, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] @@ -8328,7 +8334,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: 
v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s6, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] @@ -9763,8 +9769,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; SI-LABEL: bitcast_v10f64_to_v20f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v17, v3 @@ -9772,18 +9778,18 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -9808,8 +9814,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; VI-LABEL: bitcast_v10f64_to_v20f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v12, v6 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 @@ -9817,18 +9823,18 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -9853,8 +9859,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; GFX9-LABEL: bitcast_v10f64_to_v20f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 ; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 @@ -9862,18 +9868,18 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 @@ -11569,10 +11575,12 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v40i16_to_v20f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v44, v20 ; VI-NEXT: v_mov_b32_e32 v32, v19 ; VI-NEXT: v_mov_b32_e32 v33, v18 ; VI-NEXT: v_mov_b32_e32 v34, v17 @@ -11593,7 +11601,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v41, v2 ; VI-NEXT: v_mov_b32_e32 v42, v1 ; VI-NEXT: v_mov_b32_e32 v43, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11727,10 +11735,11 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, v20, v19 ; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -12355,18 +12364,20 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 @@ -12376,12 +12387,10 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -12428,53 +12437,53 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: 
s_add_i32 s22, s22, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 @@ -12565,15 +12574,14 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff, v35 -; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -12588,6 +12596,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s18 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB31_3 ; GFX9-NEXT: .LBB31_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 @@ -14853,10 +14862,12 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v40f16_to_v20f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v44, v20 ; VI-NEXT: v_mov_b32_e32 v32, v19 ; 
VI-NEXT: v_mov_b32_e32 v33, v18 ; VI-NEXT: v_mov_b32_e32 v34, v17 @@ -14877,7 +14888,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v41, v2 ; VI-NEXT: v_mov_b32_e32 v42, v1 ; VI-NEXT: v_mov_b32_e32 v43, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15011,10 +15022,11 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, v20, v19 ; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15800,18 +15812,20 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; 
VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 @@ -15821,12 +15835,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -15847,13 +15859,13 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -15911,13 +15923,13 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_add_f16_sdwa v18, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v34 +; 
VI-NEXT: v_add_f16_sdwa v20, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_or_b32_e32 v18, v21, v20 ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -15936,7 +15948,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v2 ; GFX9-NEXT: v_mov_b32_e32 v36, v1 ; GFX9-NEXT: v_mov_b32_e32 v37, v0 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 ; GFX9-NEXT: s_lshr_b32 s43, s26, 16 @@ -15949,7 +15961,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX9-NEXT: s_lshr_b32 s9, s19, 16 ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s40, s16, 16 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 @@ -15958,7 +15970,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s16, s40 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 @@ -15971,22 +15983,21 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 
s6, s29, s6 ; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s40 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -15999,7 +16010,8 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s18 -; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB35_3 ; GFX9-NEXT: .LBB35_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 @@ -16016,7 +16028,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 -; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s40, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] @@ -16029,7 +16041,7 @@ 
define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s6, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] @@ -16744,8 +16756,8 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; SI-LABEL: bitcast_v10f64_to_v10i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v17, v3 @@ -16753,18 +16765,18 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -16789,8 +16801,8 @@ define inreg <10 x i64> 
@bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; VI-LABEL: bitcast_v10f64_to_v10i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v12, v6 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 @@ -16798,18 +16810,18 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -16834,8 +16846,8 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; GFX9-LABEL: bitcast_v10f64_to_v10i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 ; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 @@ -16843,18 +16855,18 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, 
v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 @@ -18597,10 +18609,12 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v40i16_to_v10i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v44, v20 ; VI-NEXT: v_mov_b32_e32 v32, v19 ; VI-NEXT: v_mov_b32_e32 v33, v18 ; VI-NEXT: v_mov_b32_e32 v34, v17 @@ -18621,7 +18635,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v41, 
v2 ; VI-NEXT: v_mov_b32_e32 v42, v1 ; VI-NEXT: v_mov_b32_e32 v43, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -18755,10 +18769,11 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, v20, v19 ; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -19383,18 +19398,20 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 
s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 @@ -19404,12 +19421,10 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -19456,53 +19471,53 @@ define inreg <10 x i64> 
@bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; 
VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 @@ -19593,15 +19608,14 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: 
v_and_b32_e32 v1, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -19616,6 +19630,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s18 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB43_3 ; GFX9-NEXT: .LBB43_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 @@ -21906,10 +21921,12 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v40f16_to_v10i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, 
s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v44, v20 ; VI-NEXT: v_mov_b32_e32 v32, v19 ; VI-NEXT: v_mov_b32_e32 v33, v18 ; VI-NEXT: v_mov_b32_e32 v34, v17 @@ -21930,7 +21947,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v41, v2 ; VI-NEXT: v_mov_b32_e32 v42, v1 ; VI-NEXT: v_mov_b32_e32 v43, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22064,10 +22081,11 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, v20, v19 ; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -22853,18 +22871,20 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 +; 
VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 @@ -22874,12 +22894,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -22900,13 +22918,13 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -22964,13 +22982,13 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_add_f16_sdwa v18, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 
v20, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v34 +; VI-NEXT: v_add_f16_sdwa v20, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_or_b32_e32 v18, v21, v20 ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -22989,7 +23007,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v35, v2 ; GFX9-NEXT: v_mov_b32_e32 v36, v1 ; GFX9-NEXT: v_mov_b32_e32 v37, v0 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 ; GFX9-NEXT: s_lshr_b32 s43, s26, 16 @@ -23002,7 +23020,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s9, s19, 16 ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s40, s16, 16 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 @@ -23011,7 +23029,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s16, s40 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 @@ -23024,22 +23042,21 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: 
s_pack_ll_b32_b16 s18, s28, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s29, s6 ; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s40 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -23052,7 +23069,8 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s18 -; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB47_3 ; GFX9-NEXT: .LBB47_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 @@ -23069,7 +23087,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 -; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s40, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 
v2, s8, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] @@ -23082,7 +23100,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s6, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] @@ -23614,8 +23632,8 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -23721,8 +23739,8 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v8, v31, v8, s4 ; GFX9-NEXT: v_perm_b32 v9, v30, v9, s4 ; GFX9-NEXT: v_perm_b32 v10, v29, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v28, v11, s4 ; GFX9-NEXT: v_perm_b32 v12, v27, v12, s4 +; GFX9-NEXT: 
v_perm_b32 v11, v28, v11, s4 ; GFX9-NEXT: v_perm_b32 v13, v26, v13, s4 ; GFX9-NEXT: v_perm_b32 v14, v25, v14, s4 ; GFX9-NEXT: v_perm_b32 v15, v24, v15, s4 @@ -23885,164 +23903,164 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v17, s18 -; SI-NEXT: v_mov_b32_e32 v18, s19 -; SI-NEXT: v_mov_b32_e32 v15, s20 -; SI-NEXT: v_mov_b32_e32 v16, s21 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v12, s25 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s28 -; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v7, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v28, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v37, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v24, v9, v8, 16 +; SI-NEXT: 
v_alignbit_b32 v26, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v21 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v7, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 -; SI-NEXT: 
v_alignbit_b32 v24, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v28, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v37, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v24, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v21 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: 
v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; 
SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 36, 
v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 @@ -24063,7 +24081,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -24094,7 +24112,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_branch .LBB49_2 ; @@ -24103,18 +24121,18 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; VI-NEXT: v_mov_b32_e32 v20, s16 -; VI-NEXT: v_mov_b32_e32 v21, s17 ; VI-NEXT: v_mov_b32_e32 v18, s18 -; VI-NEXT: v_mov_b32_e32 v19, s19 ; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; 
VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB49_4 @@ -24247,18 +24265,18 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_mov_b32_e32 v20, s16 -; GFX9-NEXT: v_mov_b32_e32 v21, s17 ; GFX9-NEXT: v_mov_b32_e32 v18, s18 -; GFX9-NEXT: v_mov_b32_e32 v19, s19 ; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: v_mov_b32_e32 v14, s22 -; GFX9-NEXT: v_mov_b32_e32 v15, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 @@ -24853,10 +24871,12 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v40i16_to_v10f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v44, v20 ; VI-NEXT: v_mov_b32_e32 v32, v19 ; VI-NEXT: v_mov_b32_e32 v33, v18 ; VI-NEXT: v_mov_b32_e32 v34, v17 @@ -24877,7 +24897,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v41, v2 ; VI-NEXT: v_mov_b32_e32 v42, v1 ; VI-NEXT: v_mov_b32_e32 v43, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25011,10 +25031,11 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, v20, v19 ; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -25639,18 +25660,20 @@ define inreg <10 x double> 
@bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 @@ -25660,12 +25683,10 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -25712,53 +25733,53 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; 
VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 @@ -25849,15 +25870,14 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -25872,6 +25892,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s18 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 @@ -26592,8 +26613,8 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, 
v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -26699,8 +26720,8 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v8, v31, v8, s4 ; GFX9-NEXT: v_perm_b32 v9, v30, v9, s4 ; GFX9-NEXT: v_perm_b32 v10, v29, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v28, v11, s4 ; GFX9-NEXT: v_perm_b32 v12, v27, v12, s4 +; GFX9-NEXT: v_perm_b32 v11, v28, v11, s4 ; GFX9-NEXT: v_perm_b32 v13, v26, v13, s4 ; GFX9-NEXT: v_perm_b32 v14, v25, v14, s4 ; GFX9-NEXT: v_perm_b32 v15, v24, v15, s4 @@ -27205,18 +27226,18 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; VI-NEXT: v_mov_b32_e32 v20, s16 -; VI-NEXT: v_mov_b32_e32 v21, s17 ; VI-NEXT: v_mov_b32_e32 v18, s18 -; VI-NEXT: v_mov_b32_e32 v19, s19 ; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 
v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB53_4 @@ -27349,18 +27370,18 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_mov_b32_e32 v20, s16 -; GFX9-NEXT: v_mov_b32_e32 v21, s17 ; GFX9-NEXT: v_mov_b32_e32 v18, s18 -; GFX9-NEXT: v_mov_b32_e32 v19, s19 ; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: v_mov_b32_e32 v14, s22 -; GFX9-NEXT: v_mov_b32_e32 v15, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 @@ -28070,10 +28091,12 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v40f16_to_v10f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v44, v20 ; VI-NEXT: v_mov_b32_e32 v32, v19 ; VI-NEXT: v_mov_b32_e32 v33, v18 ; VI-NEXT: v_mov_b32_e32 v34, v17 @@ -28094,7 +28117,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v41, v2 ; VI-NEXT: v_mov_b32_e32 v42, v1 ; VI-NEXT: v_mov_b32_e32 v43, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -28228,10 +28251,11 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, v20, v19 ; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -29017,18 +29041,20 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; 
VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 @@ -29038,12 +29064,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, 
v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -29064,13 +29088,13 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -29128,13 +29152,13 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_add_f16_sdwa v18, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 -; VI-NEXT: 
v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v34 +; VI-NEXT: v_add_f16_sdwa v20, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_or_b32_e32 v18, v21, v20 ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -29153,7 +29177,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v35, v2 ; GFX9-NEXT: v_mov_b32_e32 v36, v1 ; GFX9-NEXT: v_mov_b32_e32 v37, v0 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 ; GFX9-NEXT: s_lshr_b32 s43, s26, 16 @@ -29166,7 +29190,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX9-NEXT: s_lshr_b32 s9, s19, 16 ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s40, s16, 16 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 @@ -29175,7 +29199,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s16, s40 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 @@ -29188,22 +29212,21 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 -; 
GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s29, s6 ; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s40 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -29216,7 +29239,8 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: v_mov_b32_e32 v11, s17 ; GFX9-NEXT: v_mov_b32_e32 v12, s18 -; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB55_3 ; GFX9-NEXT: .LBB55_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 @@ -29233,7 +29257,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 -; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s40, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] ; 
GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] @@ -29246,7 +29270,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s6, v13 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] @@ -29997,10 +30021,10 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30013,11 +30037,11 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v38, 3, v38 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: v_add_u16_e32 v37, 3, v37 +; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_add_u16_e32 v36, 3, v36 +; VI-NEXT: v_add_u16_e32 v37, 3, v37 ; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_add_u16_e32 v36, 3, v36 ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_add_u16_e32 v35, 3, v35 ; VI-NEXT: v_add_u16_e32 v6, 3, v6 @@ -30051,6 +30075,10 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, 
exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v2, v2, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 +; VI-NEXT: v_or_b32_sdwa v3, v3, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 ; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30064,8 +30092,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v9, v9, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v30 ; VI-NEXT: v_or_b32_sdwa v10, v10, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 -; VI-NEXT: v_or_b32_sdwa v11, v11, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 ; VI-NEXT: v_or_b32_sdwa v12, v12, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 @@ -30081,14 +30107,12 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_sdwa v18, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; VI-NEXT: v_or_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
VI-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -30097,24 +30121,24 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v25, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -30124,85 +30148,85 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s6 ; GFX9-NEXT: v_perm_b32 v18, v38, v18, s6 -; GFX9-NEXT: v_perm_b32 v17, v37, v17, s6 -; GFX9-NEXT: v_perm_b32 v16, v36, v16, s6 -; GFX9-NEXT: v_perm_b32 v15, v35, v15, s6 -; GFX9-NEXT: v_perm_b32 v14, v34, v14, s6 -; GFX9-NEXT: v_perm_b32 v13, v33, v13, s6 -; GFX9-NEXT: v_perm_b32 v12, v32, v12, s6 -; GFX9-NEXT: v_perm_b32 v11, v31, v11, s6 -; GFX9-NEXT: v_perm_b32 v10, v30, v10, s6 -; GFX9-NEXT: v_perm_b32 v9, v29, v9, s6 -; GFX9-NEXT: v_perm_b32 v8, v28, v8, s6 -; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 -; GFX9-NEXT: v_perm_b32 v6, v26, v6, s6 -; GFX9-NEXT: v_perm_b32 v5, v24, v5, s6 -; GFX9-NEXT: v_perm_b32 v4, v25, v4, s6 -; GFX9-NEXT: v_perm_b32 v3, v23, v3, s6 -; GFX9-NEXT: v_perm_b32 v2, v22, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v21, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v20, v0, s6 -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 
op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_perm_b32 v1, v36, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v37, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v30, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v31, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v32, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v33, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v34, v7, s6 +; GFX9-NEXT: v_perm_b32 v8, v29, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v28, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v35, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v27, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v26, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v25, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v24, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v23, v15, s6 +; GFX9-NEXT: v_perm_b32 v16, v22, v16, s6 +; GFX9-NEXT: v_perm_b32 v17, v21, v17, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v20, v0, s4 -; GFX9-NEXT: v_perm_b32 
v1, v21, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v22, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v23, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v25, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v24, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v26, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v27, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v29, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v30, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v31, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v33, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v34, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v35, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v36, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v37, v17, s4 +; GFX9-NEXT: v_perm_b32 v1, v36, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v37, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v30, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v32, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v33, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v34, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v29, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v28, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v35, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v27, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v26, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v25, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v24, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v23, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v22, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v21, v17, s4 ; GFX9-NEXT: v_perm_b32 v18, v38, v18, s4 ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -30416,38 +30440,38 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s29 +; SI-NEXT: 
v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 @@ -30502,33 +30526,33 @@ define inreg <40 x half> 
@bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) @@ -30573,15 +30597,15 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -30595,34 +30619,34 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -30630,7 +30654,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30639,7 +30663,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30656,22 +30680,22 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 
0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -30711,30 +30735,30 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; 
implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -30916,9 +30940,9 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 ; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 -; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 ; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 ; GFX9-NEXT: v_pk_add_u16 v9, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 @@ -30934,29 +30958,29 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 ; GFX9-NEXT: v_pk_add_u16 v21, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_u16 v20, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_pk_add_u16 v20, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 
v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 ; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 -; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_pk_add_u16 v24, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v24, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 @@ -30967,10 +30991,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 @@ -30983,8 +31007,8 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; GFX9-NEXT: .LBB57_4: ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v10, s27 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 @@ -30995,10 +31019,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v20, s18 ; GFX9-NEXT: v_mov_b32_e32 v25, s17 ; GFX9-NEXT: v_mov_b32_e32 v24, s16 -; GFX9-NEXT: v_mov_b32_e32 v26, s43 -; GFX9-NEXT: v_mov_b32_e32 v27, s42 -; GFX9-NEXT: v_mov_b32_e32 v28, s41 -; GFX9-NEXT: v_mov_b32_e32 v29, s40 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 ; GFX9-NEXT: v_mov_b32_e32 v30, s15 ; GFX9-NEXT: v_mov_b32_e32 v31, s14 ; GFX9-NEXT: v_mov_b32_e32 v32, s13 @@ -31023,6 +31047,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 @@ -31034,9 +31059,9 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: 
v_and_b32_e32 v30, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 @@ -31044,11 +31069,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 ; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 ; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v10, v26, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v27, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v12, v28, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v29, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v24 ; GFX9-NEXT: v_mov_b32_e32 v1, v25 @@ -31806,10 +31830,10 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -31822,11 +31846,11 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_add_f16_e32 v38, 0x200, v38 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_e32 v37, 0x200, v37 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_add_f16_e32 v36, 0x200, v36 +; VI-NEXT: v_add_f16_e32 v37, 0x200, v37 ; VI-NEXT: 
v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v36, 0x200, v36 ; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 ; VI-NEXT: v_add_f16_e32 v35, 0x200, v35 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -31860,6 +31884,10 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v2, v2, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 +; VI-NEXT: v_or_b32_sdwa v3, v3, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 ; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -31873,8 +31901,6 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v9, v9, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v30 ; VI-NEXT: v_or_b32_sdwa v10, v10, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 -; VI-NEXT: v_or_b32_sdwa v11, v11, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 ; VI-NEXT: v_or_b32_sdwa v12, v12, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 @@ -31890,14 +31916,12 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: 
v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_sdwa v18, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; VI-NEXT: v_or_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -31906,24 +31930,24 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -31933,86 +31957,86 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s6 ; GFX9-NEXT: v_perm_b32 v18, v38, v18, s6 -; GFX9-NEXT: v_perm_b32 v17, v37, v17, s6 -; GFX9-NEXT: v_perm_b32 v16, v36, v16, s6 -; GFX9-NEXT: v_perm_b32 v15, v35, v15, s6 -; GFX9-NEXT: v_perm_b32 v14, v34, v14, s6 -; GFX9-NEXT: v_perm_b32 v13, v33, v13, s6 -; GFX9-NEXT: v_perm_b32 v12, v32, v12, s6 -; GFX9-NEXT: v_perm_b32 v11, v31, v11, s6 -; GFX9-NEXT: v_perm_b32 v10, v30, v10, s6 -; GFX9-NEXT: v_perm_b32 v9, v29, v9, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v8, v28, v8, s6 -; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 -; GFX9-NEXT: v_perm_b32 v6, v26, v6, s6 -; GFX9-NEXT: v_perm_b32 v5, v24, v5, s6 -; GFX9-NEXT: v_perm_b32 v4, v25, v4, s6 -; GFX9-NEXT: v_perm_b32 v3, v23, v3, s6 -; GFX9-NEXT: v_perm_b32 v2, v22, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v21, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v20, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v36, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v37, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v30, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v31, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v32, v5, s6 +; GFX9-NEXT: 
v_perm_b32 v6, v33, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v34, v7, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v8, v29, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v28, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v35, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v27, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v26, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v25, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v24, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v23, v15, s6 +; GFX9-NEXT: v_perm_b32 v16, v22, v16, s6 +; GFX9-NEXT: v_perm_b32 v17, v21, v17, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, 
s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v20, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v21, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v22, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v23, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v25, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v24, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v26, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v27, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v28, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v29, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v30, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v31, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v33, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v34, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v35, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v36, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v37, v17, s4 +; GFX9-NEXT: v_perm_b32 v1, v36, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v37, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v30, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v31, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v32, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v33, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v34, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v29, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v28, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v35, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v27, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v26, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v25, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v24, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v23, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v22, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v21, v17, s4 ; GFX9-NEXT: v_perm_b32 v18, v38, v18, s4 ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -32185,106 +32209,110 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> 
inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v37, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s25 ; 
SI-NEXT: v_cvt_f16_f32_e32 v14, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v15, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v27, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_or_b32_e32 v37, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: 
v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v39 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v49 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v34, v25, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v37 +; SI-NEXT: v_or_b32_e32 v34, v23, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -32297,39 +32325,52 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v5, v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_or_b32_e32 v9, v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_or_b32_e32 v12, v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: 
v_lshlrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_or_b32_e32 v28, v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_or_b32_e32 v33, v23, v33 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v35, v35, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 @@ -32337,24 +32378,14 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: 
v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 @@ -32362,11 +32393,8 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v33, v25, v33 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 @@ -32374,58 +32402,55 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v35, v35, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_or_b32_e32 v14, v14, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_or_b32_e32 v19, v19, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v49 +; SI-NEXT: v_or_b32_e32 v36, v23, v36 ; SI-NEXT: v_or_b32_e32 v20, v20, v25 -; 
SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v39 -; SI-NEXT: v_or_b32_e32 v19, v19, v25 -; SI-NEXT: v_or_b32_e32 v22, v22, v26 -; SI-NEXT: v_or_b32_e32 v21, v21, v27 -; SI-NEXT: v_or_b32_e32 v16, v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v15, v48 -; SI-NEXT: v_or_b32_e32 v30, v30, v38 -; SI-NEXT: v_or_b32_e32 v29, v29, v37 -; SI-NEXT: v_or_b32_e32 v11, v11, v51 -; SI-NEXT: v_or_b32_e32 v6, v6, v23 -; SI-NEXT: v_or_b32_e32 v4, v4, v52 -; SI-NEXT: v_alignbit_b32 v49, v19, v26, 16 -; SI-NEXT: v_alignbit_b32 v26, v20, v27, 16 -; SI-NEXT: v_alignbit_b32 v25, v14, v24, 16 -; SI-NEXT: v_alignbit_b32 v24, v35, v48, 16 -; SI-NEXT: v_alignbit_b32 v48, v33, v50, 16 -; SI-NEXT: v_alignbit_b32 v39, v28, v38, 16 -; SI-NEXT: v_alignbit_b32 v38, v12, v37, 16 -; SI-NEXT: v_alignbit_b32 v37, v9, v51, 16 -; SI-NEXT: v_alignbit_b32 v36, v3, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, v5, v52, 16 +; SI-NEXT: v_or_b32_e32 v16, v16, v22 +; SI-NEXT: v_or_b32_e32 v15, v15, v26 +; SI-NEXT: v_or_b32_e32 v30, v30, v48 +; SI-NEXT: v_or_b32_e32 v29, v29, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v52 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v53 +; SI-NEXT: v_alignbit_b32 v51, v36, v24, 16 +; SI-NEXT: v_alignbit_b32 v24, v19, v25, 16 +; SI-NEXT: v_alignbit_b32 v23, v14, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, v35, v26, 16 +; SI-NEXT: v_alignbit_b32 v50, v33, v27, 16 +; SI-NEXT: v_alignbit_b32 v49, v28, v48, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, v9, v52, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v53, 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v22, v22, v27 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 
0 offen -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v25, v18 +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 @@ -32436,7 +32461,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen @@ -32448,7 +32473,7 @@ define 
inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen @@ -32460,7 +32485,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen @@ -32472,7 +32497,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 ; SI-NEXT: v_or_b32_e32 v10, v10, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen @@ -32484,7 +32509,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 ; SI-NEXT: v_or_b32_e32 v7, v7, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen @@ -32496,7 +32521,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v38 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 @@ -32507,7 +32532,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -32676,20 +32701,20 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; GFX9-LABEL: bitcast_v40f16_to_v40i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s43, s29, 16 -; GFX9-NEXT: s_lshr_b32 s42, s28, 16 -; GFX9-NEXT: s_lshr_b32 s41, s27, 16 -; GFX9-NEXT: s_lshr_b32 s40, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s15, s24, 16 +; GFX9-NEXT: s_lshr_b32 s14, s23, 16 +; GFX9-NEXT: s_lshr_b32 s13, s22, 16 +; GFX9-NEXT: s_lshr_b32 s12, s21, 16 +; GFX9-NEXT: s_lshr_b32 s11, s20, 16 +; GFX9-NEXT: s_lshr_b32 s10, s19, 16 +; GFX9-NEXT: s_lshr_b32 s9, s18, 16 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 @@ -32702,68 +32727,68 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_cbranch_execnz .LBB59_4 ; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v26, 0x200 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s43 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s42 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s40 +; GFX9-NEXT: v_pk_add_f16 v9, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s15 +; GFX9-NEXT: v_pk_add_f16 v8, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s14 +; GFX9-NEXT: v_pk_add_f16 v7, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s13 +; GFX9-NEXT: v_pk_add_f16 v6, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s12 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s11 +; GFX9-NEXT: v_pk_add_f16 v22, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s10 +; GFX9-NEXT: v_pk_add_f16 v21, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s9 +; GFX9-NEXT: v_pk_add_f16 v20, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s8 +; GFX9-NEXT: v_pk_add_f16 v23, s4, v26 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s7 +; GFX9-NEXT: v_pk_add_f16 v24, s4, v26 op_sel_hi:[1,0] ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s6 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_pk_add_f16 v26, s4, v26 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 -; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 -; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 -; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 -; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 -; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 -; GFX9-NEXT: v_pk_add_f16 v9, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 -; GFX9-NEXT: v_pk_add_f16 v8, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 -; GFX9-NEXT: v_pk_add_f16 v7, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 -; GFX9-NEXT: v_pk_add_f16 v6, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 -; GFX9-NEXT: v_pk_add_f16 v23, s4, 
v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 -; GFX9-NEXT: v_pk_add_f16 v22, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 -; GFX9-NEXT: v_pk_add_f16 v21, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 -; GFX9-NEXT: v_pk_add_f16 v20, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 -; GFX9-NEXT: v_pk_add_f16 v25, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 -; GFX9-NEXT: v_pk_add_f16 v24, s4, v14 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 @@ -32774,74 +32799,74 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: .LBB59_3: ; GFX9-NEXT: 
s_branch .LBB59_2 ; GFX9-NEXT: .LBB59_4: -; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s29 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v10, s27 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v23, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s21 ; GFX9-NEXT: v_mov_b32_e32 v22, s20 ; GFX9-NEXT: v_mov_b32_e32 v21, s19 ; GFX9-NEXT: v_mov_b32_e32 v20, s18 -; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 ; GFX9-NEXT: v_mov_b32_e32 v24, s16 -; GFX9-NEXT: v_mov_b32_e32 v26, s43 -; GFX9-NEXT: v_mov_b32_e32 v27, s42 -; GFX9-NEXT: v_mov_b32_e32 v28, s41 -; GFX9-NEXT: v_mov_b32_e32 v29, s40 -; GFX9-NEXT: v_mov_b32_e32 v30, s15 -; GFX9-NEXT: v_mov_b32_e32 v31, s14 -; GFX9-NEXT: v_mov_b32_e32 v32, s13 -; GFX9-NEXT: v_mov_b32_e32 v33, s12 -; GFX9-NEXT: v_mov_b32_e32 v34, s11 -; GFX9-NEXT: v_mov_b32_e32 v35, s10 -; GFX9-NEXT: v_mov_b32_e32 v36, s9 -; GFX9-NEXT: v_mov_b32_e32 v37, s8 -; GFX9-NEXT: v_mov_b32_e32 v38, s7 -; GFX9-NEXT: v_mov_b32_e32 v39, s6 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v31, s40 +; GFX9-NEXT: v_mov_b32_e32 v32, s15 +; GFX9-NEXT: v_mov_b32_e32 v33, s14 +; GFX9-NEXT: v_mov_b32_e32 v34, s13 +; GFX9-NEXT: v_mov_b32_e32 v35, s12 +; GFX9-NEXT: v_mov_b32_e32 v36, s11 +; GFX9-NEXT: v_mov_b32_e32 v37, s10 +; GFX9-NEXT: v_mov_b32_e32 v38, s9 +; GFX9-NEXT: v_mov_b32_e32 v25, s8 +; GFX9-NEXT: v_mov_b32_e32 v39, s7 ; GFX9-NEXT: .LBB59_5: ; %end ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, 
v2 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v23 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v23, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v10 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 -; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 -; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 -; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v21 -; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v20, v38, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v37, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v36, 16, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v27, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, 
v27, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v6, v34, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v32, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v31, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v30, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v24 ; GFX9-NEXT: v_mov_b32_e32 v1, v25 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 9f5c9c4c509ed..38318025fb45a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -168,8 +168,8 @@ define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, ; SI-LABEL: bitcast_v22i32_to_v22f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -190,8 +190,8 @@ define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -227,8 +227,8 @@ define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, ; VI-LABEL: bitcast_v22i32_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: 
v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -249,8 +249,8 @@ define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB1_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -286,8 +286,8 @@ define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, ; GFX9-LABEL: bitcast_v22i32_to_v22f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -308,8 +308,8 @@ define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -560,8 +560,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, ; SI-LABEL: bitcast_v22f32_to_v22i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -582,8 +582,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: 
v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -619,8 +619,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, ; VI-LABEL: bitcast_v22f32_to_v22i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -641,8 +641,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB3_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -678,8 +678,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, ; GFX9-LABEL: bitcast_v22f32_to_v22i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -700,8 +700,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; 
GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -952,8 +952,8 @@ define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v22i32_to_v11i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -974,8 +974,8 @@ define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -1011,8 +1011,8 @@ define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i3 ; VI-LABEL: bitcast_v22i32_to_v11i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -1033,8 +1033,8 @@ define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB5_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -1070,8 +1070,8 @@ define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i3 ; GFX9-LABEL: bitcast_v22i32_to_v11i64_scalar: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -1092,8 +1092,8 @@ define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -1361,8 +1361,8 @@ define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v11i64_to_v22i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -1383,8 +1383,8 @@ define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -1420,8 +1420,8 @@ define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i3 ; VI-LABEL: bitcast_v11i64_to_v22i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; 
VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -1442,8 +1442,8 @@ define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -1479,8 +1479,8 @@ define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i3 ; GFX9-LABEL: bitcast_v11i64_to_v22i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -1501,8 +1501,8 @@ define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -1770,8 +1770,8 @@ define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, ; SI-LABEL: bitcast_v22i32_to_v11f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -1792,8 +1792,8 @@ define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg 
%a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -1829,8 +1829,8 @@ define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, ; VI-LABEL: bitcast_v22i32_to_v11f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -1851,8 +1851,8 @@ define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB9_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -1888,8 +1888,8 @@ define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, ; GFX9-LABEL: bitcast_v22i32_to_v11f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -1910,8 +1910,8 @@ define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 
s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -2129,8 +2129,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; SI-LABEL: bitcast_v11f64_to_v22i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v9, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -2140,19 +2140,19 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -2177,8 +2177,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; VI-LABEL: bitcast_v11f64_to_v22i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v9, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, 
v5 @@ -2188,19 +2188,19 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -2225,8 +2225,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; GFX9-LABEL: bitcast_v11f64_to_v22i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -2236,19 +2236,19 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 
v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -2331,54 +2331,54 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v26, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v50, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 
; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 @@ -2405,157 +2405,157 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v26, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v50, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; 
SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -2566,6 +2566,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 @@ -2587,33 +2588,32 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; 
VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_4 @@ -2640,73 +2640,73 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: 
v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 
v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_or_b32_sdwa v0, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v15, v15, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; VI-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_or_b32_sdwa v4, v4, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; VI-NEXT: v_or_b32_sdwa v6, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v7, v7, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; VI-NEXT: v_or_b32_sdwa v8, v8, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 
16, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_or_b32_sdwa v10, v10, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v14, v14, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; VI-NEXT: v_or_b32_sdwa v15, v15, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; VI-NEXT: v_or_b32_sdwa v16, v16, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v17, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; VI-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2714,6 +2714,7 @@ define <44 x i16> 
@bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -2735,33 +2736,32 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 
v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_4 @@ -2788,53 +2788,53 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v50, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v49, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v48, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v39, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v38, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v37, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v36, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v35, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v34, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v32, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v31, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v30, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v29, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v28, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v27, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v26, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v25, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v24, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v23, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 
+; GFX9-NEXT: v_perm_b32 v0, v52, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v51, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v50, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v49, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v39, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v38, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v37, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v36, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v35, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v28, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v27, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v26, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v25, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v24, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v23, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44i16: @@ -5024,18 +5024,20 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 
0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 @@ -5055,12 +5057,10 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -5091,77 +5091,77 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 
; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 
0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, 
s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 @@ -5255,6 +5255,8 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -5264,10 +5266,8 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 
v1, s7 @@ -5286,23 +5286,22 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: s_cbranch_execnz .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -5317,6 +5316,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -5995,6 +5995,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 @@ -6016,33 +6017,32 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 
v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB16_4 @@ -6069,73 +6069,73 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; 
VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_or_b32_sdwa v0, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; VI-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_or_b32_sdwa v4, v4, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; VI-NEXT: v_or_b32_sdwa v6, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v7, v7, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; VI-NEXT: v_or_b32_sdwa v8, v8, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_or_b32_sdwa v10, v10, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 
16, v30 +; VI-NEXT: v_or_b32_sdwa v14, v14, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; VI-NEXT: v_or_b32_sdwa v15, v15, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; VI-NEXT: v_or_b32_sdwa v16, v16, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v17, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; VI-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6143,6 +6143,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -6164,33 +6165,32 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 
-; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; 
GFX9-NEXT: s_cbranch_execz .LBB16_4 @@ -6217,53 +6217,53 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v50, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v49, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v48, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v39, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v38, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v37, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v36, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v35, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v34, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v32, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v31, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v30, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v29, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v28, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v27, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v26, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v25, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v24, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v23, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v52, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v51, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v50, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v49, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v39, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v38, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v37, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v36, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v35, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v28, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v27, v17, s4 +; 
GFX9-NEXT: v_perm_b32 v18, v26, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v25, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v24, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v23, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44f16: @@ -6620,27 +6620,26 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 ; SI-NEXT: .LBB17_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v53, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 ; SI-NEXT: v_or_b32_e32 v49, v49, v50 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v49, v53, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v39, v39, v48 @@ -8866,18 +8865,20 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 
0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 @@ -8897,12 +8898,10 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -8923,13 +8922,13 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -8993,13 +8992,13 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_add_f16_sdwa v20, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_e32 v19, 0x200, 
v34 +; VI-NEXT: v_add_f16_sdwa v22, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_or_b32_e32 v20, v23, v22 ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 @@ -9062,6 +9061,8 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -9071,10 +9072,8 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -9509,8 +9508,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, ; SI-LABEL: bitcast_v22f32_to_v11i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -9531,8 +9530,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: 
v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -9568,8 +9567,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, ; VI-LABEL: bitcast_v22f32_to_v11i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -9590,8 +9589,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB21_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -9627,8 +9626,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, ; GFX9-LABEL: bitcast_v22f32_to_v11i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -9649,8 +9648,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, 
s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -9907,8 +9906,8 @@ define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, ; SI-LABEL: bitcast_v11i64_to_v22f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -9929,8 +9928,8 @@ define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -9966,8 +9965,8 @@ define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, ; VI-LABEL: bitcast_v11i64_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -9988,8 +9987,8 @@ define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB23_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -10025,8 +10024,8 @@ define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, ; GFX9-LABEL: bitcast_v11i64_to_v22f32_scalar: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -10047,8 +10046,8 @@ define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -10305,8 +10304,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg % ; SI-LABEL: bitcast_v22f32_to_v11f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -10327,8 +10326,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -10364,8 +10363,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg % ; VI-LABEL: bitcast_v22f32_to_v11f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -10386,8 +10385,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB25_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -10423,8 +10422,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg % ; GFX9-LABEL: bitcast_v22f32_to_v11f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -10445,8 +10444,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -10653,8 +10652,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; SI-LABEL: bitcast_v11f64_to_v22f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v9, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -10664,19 +10663,19 @@ define inreg <22 x float> 
@bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -10701,8 +10700,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; VI-LABEL: bitcast_v11f64_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v9, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -10712,19 +10711,19 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: 
v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -10749,8 +10748,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; GFX9-LABEL: bitcast_v11f64_to_v22f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -10760,19 +10759,19 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -10855,54 +10854,54 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v26, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v50, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 @@ -10929,157 +10928,157 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: 
v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v26, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v50, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -11090,6 +11089,7 
@@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 @@ -11111,33 +11111,32 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; 
VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB28_4 @@ -11164,73 +11163,73 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: 
v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_or_b32_sdwa v0, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v23 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; VI-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_or_b32_sdwa v4, v4, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; VI-NEXT: v_or_b32_sdwa v6, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v7, v7, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; VI-NEXT: v_or_b32_sdwa v8, v8, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_or_b32_sdwa v10, v10, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v22, 16, v31 +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v14, v14, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; VI-NEXT: v_or_b32_sdwa v15, v15, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; VI-NEXT: v_or_b32_sdwa v16, v16, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v17, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; VI-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11238,6 +11237,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -11259,33 +11259,32 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB28_4 @@ -11312,53 +11311,53 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v50, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v49, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v48, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v39, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v38, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v37, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v36, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v35, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v34, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v32, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v31, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v30, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v29, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v28, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v27, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v26, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v25, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v24, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v23, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v52, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v51, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v50, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v49, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v39, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v38, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v37, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v36, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v35, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 
+; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v28, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v27, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v26, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v25, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v24, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v23, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44i16: @@ -11524,61 +11523,61 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v24, s16 ; SI-NEXT: v_mov_b32_e32 v22, s17 ; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v20, s19 ; SI-NEXT: v_mov_b32_e32 v18, s20 ; SI-NEXT: v_mov_b32_e32 v17, s21 ; SI-NEXT: v_mov_b32_e32 v16, s22 ; SI-NEXT: v_mov_b32_e32 v15, s23 ; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_mov_b32_e32 v12, s25 -; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_mov_b32_e32 v10, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s28 ; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v20, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v23, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v28, v9, v10, 16 -; SI-NEXT: v_alignbit_b32 v30, v11, v13, 16 -; SI-NEXT: v_alignbit_b32 v32, v12, v14, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v30, v10, v12, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, 
v14, 16 ; SI-NEXT: v_alignbit_b32 v34, v15, v16, 16 ; SI-NEXT: v_alignbit_b32 v37, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v39, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v24, 16 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 @@ -11587,57 +11586,57 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: 
v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v20, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v23, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v28, v9, v10, 16 -; SI-NEXT: v_alignbit_b32 v30, v11, v13, 16 -; SI-NEXT: v_alignbit_b32 v32, v12, v14, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v30, v10, v12, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v14, 16 ; SI-NEXT: v_alignbit_b32 v34, v15, v16, 16 ; SI-NEXT: v_alignbit_b32 v37, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v39, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v24, 16 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v23, v23, v49 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v24, v24, v49 +; SI-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: 
buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 @@ -11662,26 +11661,26 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen 
+; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 @@ -11718,7 +11717,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -11730,7 +11729,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -11761,9 +11760,9 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: ; 
implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_branch .LBB29_2 ; @@ -11778,13 +11777,13 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v18, s20 ; VI-NEXT: v_mov_b32_e32 v17, s21 ; VI-NEXT: v_mov_b32_e32 v16, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v14, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v15, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB29_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -11797,12 +11796,12 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 @@ -11821,12 +11820,12 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; 
VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -11843,12 +11842,12 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 @@ -11858,36 +11857,36 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; VI-NEXT: .LBB29_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v8, v14, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 ; VI-NEXT: v_or_b32_sdwa v25, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 ; VI-NEXT: v_or_b32_sdwa v26, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 -; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 ; VI-NEXT: v_or_b32_sdwa v27, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 -; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; VI-NEXT: v_or_b32_sdwa v28, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 ; VI-NEXT: v_or_b32_sdwa v29, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_lshlrev_b32_e32 v16, 16, v23 ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 -; VI-NEXT: v_or_b32_sdwa v23, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -11946,13 +11945,13 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v18, s20 ; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: v_mov_b32_e32 v16, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v14, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v15, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -11965,12 +11964,12 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 
v37, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 @@ -11989,12 +11988,12 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -12011,12 +12010,12 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 @@ -12025,17 +12024,15 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GFX9-NEXT: .LBB29_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v12 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 @@ -12060,6 +12057,8 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 @@ -13501,18 +13500,20 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 
0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 @@ -13532,12 +13533,10 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -13568,77 +13567,77 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; 
VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: 
v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 @@ -13732,6 +13731,8 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -13741,10 +13742,8 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -13763,23 +13762,22 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: s_cbranch_execnz .LBB31_3 ; GFX9-NEXT: .LBB31_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; 
GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -13794,6 +13792,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -14472,6 +14471,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 @@ -14493,33 +14493,32 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; 
VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB32_4 @@ -14546,73 +14545,73 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: 
v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 
v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_or_b32_sdwa v0, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v28 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; VI-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_or_b32_sdwa v4, v4, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; VI-NEXT: v_or_b32_sdwa v6, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v7, v7, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; VI-NEXT: v_or_b32_sdwa v8, v8, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, 
v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_or_b32_sdwa v10, v10, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v14, v14, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; VI-NEXT: v_or_b32_sdwa v15, v15, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; VI-NEXT: v_or_b32_sdwa v16, v16, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v17, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; VI-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14620,6 +14619,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { 
; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -14641,33 +14641,32 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 
v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB32_4 @@ -14694,53 +14693,53 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v50, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v49, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v48, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v39, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v38, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v37, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v36, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v35, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v34, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v32, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v31, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v30, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v29, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v28, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v27, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v26, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v25, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v24, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v23, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v52, 
v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v51, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v50, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v49, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v39, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v38, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v37, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v36, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v35, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v28, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v27, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v26, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v25, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v24, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v23, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44f16: @@ -15090,27 +15089,26 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v53, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 ; SI-NEXT: 
v_or_b32_e32 v49, v49, v50 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v49, v53, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v39, v39, v48 @@ -15309,13 +15307,13 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v18, s20 ; VI-NEXT: v_mov_b32_e32 v17, s21 ; VI-NEXT: v_mov_b32_e32 v16, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v14, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v15, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB33_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -15328,12 +15326,12 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 @@ -15352,12 +15350,12 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x 
float> inreg %a, ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -15374,12 +15372,12 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 @@ -15389,36 +15387,36 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; VI-NEXT: .LBB33_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v8, v14, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 ; VI-NEXT: 
v_or_b32_sdwa v25, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 ; VI-NEXT: v_or_b32_sdwa v26, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 -; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 ; VI-NEXT: v_or_b32_sdwa v27, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 -; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; VI-NEXT: v_or_b32_sdwa v28, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 ; VI-NEXT: v_or_b32_sdwa v29, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; VI-NEXT: v_or_b32_sdwa v22, v16, 
v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 -; VI-NEXT: v_or_b32_sdwa v23, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -15477,13 +15475,13 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v18, s20 ; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: v_mov_b32_e32 v16, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v14, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v15, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -15496,12 +15494,12 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> 
inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 @@ -15520,12 +15518,12 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -15542,12 +15540,12 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v8, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 @@ -15556,17 +15554,15 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GFX9-NEXT: .LBB33_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v12 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 @@ -15591,6 +15587,8 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 @@ -17336,18 +17334,20 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: 
v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 @@ -17367,12 +17367,10 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -17393,13 +17391,13 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -17463,13 +17461,13 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_add_f16_sdwa v20, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 
v22, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v34 +; VI-NEXT: v_add_f16_sdwa v22, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_or_b32_e32 v20, v23, v22 ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 @@ -17532,6 +17530,8 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -17541,10 +17541,8 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -17996,8 +17994,8 @@ define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, ; SI-LABEL: bitcast_v11i64_to_v11f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -18018,8 +18016,8 @@ define inreg <11 x double> 
@bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB37_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -18055,8 +18053,8 @@ define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, ; VI-LABEL: bitcast_v11i64_to_v11f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v13, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -18077,8 +18075,8 @@ define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB37_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -18114,8 +18112,8 @@ define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, ; GFX9-LABEL: bitcast_v11i64_to_v11f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -18136,8 +18134,8 @@ define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; 
GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -18361,8 +18359,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; SI-LABEL: bitcast_v11f64_to_v11i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v9, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -18372,19 +18370,19 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -18409,8 +18407,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; VI-LABEL: bitcast_v11f64_to_v11i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v9, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; VI-NEXT: v_mov_b32_e32 v21, 
v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -18420,19 +18418,19 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -18457,8 +18455,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; GFX9-LABEL: bitcast_v11f64_to_v11i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -18468,19 +18466,19 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 
v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -18563,54 +18561,54 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: 
$vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v26, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v50, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: 
v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 @@ -18637,157 +18635,157 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v26, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v50, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; 
SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; 
SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -18798,6 +18796,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 @@ -18819,33 +18818,32 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 
-; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB40_4 @@ -18872,73 +18870,73 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: 
v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_or_b32_sdwa v0, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v14, v14, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; VI-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_or_b32_sdwa v4, v4, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; VI-NEXT: v_or_b32_sdwa v6, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v7, v7, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; VI-NEXT: v_or_b32_sdwa v8, v8, 
v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_or_b32_sdwa v10, v10, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v14, v14, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; VI-NEXT: v_or_b32_sdwa v15, v15, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; VI-NEXT: v_or_b32_sdwa v16, v16, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v17, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; VI-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -18946,6 +18944,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -18967,33 +18966,32 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB40_4 @@ -19020,53 +19018,53 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v50, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v49, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v48, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v39, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v38, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v37, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v36, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v35, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v34, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v32, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v31, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v30, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v29, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v28, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v27, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v26, v17, s4 -; GFX9-NEXT: v_perm_b32 
v18, v25, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v24, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v23, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v52, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v51, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v50, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v49, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v39, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v38, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v37, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v36, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v35, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v28, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v27, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v26, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v25, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v24, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v23, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44i16: @@ -21268,18 +21266,20 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 @@ -21299,12 +21299,10 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -21335,77 +21333,77 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; 
VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 
+; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 @@ -21499,6 +21497,8 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -21508,10 +21508,8 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -21530,23 +21528,22 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: s_cbranch_execnz .LBB43_3 ; GFX9-NEXT: .LBB43_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -21561,6 +21558,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 
op_sel_hi:[1,0] @@ -22239,6 +22237,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 @@ -22260,33 +22259,32 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB44_4 @@ -22313,73 +22311,73 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: 
v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_or_b32_sdwa v0, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v20, v20, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; VI-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_or_b32_sdwa v4, v4, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; VI-NEXT: v_or_b32_sdwa v6, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v7, v7, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; VI-NEXT: v_or_b32_sdwa v8, v8, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_or_b32_sdwa v10, v10, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v14, v14, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; VI-NEXT: v_or_b32_sdwa v15, v15, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; VI-NEXT: v_or_b32_sdwa v16, v16, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v17, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; VI-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -22387,6 +22385,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -22408,33 +22407,32 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; 
implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 
v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB44_4 @@ -22461,53 +22459,53 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v50, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v49, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v48, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v39, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v38, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v37, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v36, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v35, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v34, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v32, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v31, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v30, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v29, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v28, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v27, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v26, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v25, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v24, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v23, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v52, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v51, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v50, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v49, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v39, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v38, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v37, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v36, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v35, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v33, v11, 
s4 +; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v28, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v27, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v26, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v25, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v24, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v23, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44f16: @@ -22876,27 +22874,26 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v52, s14 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v53, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 ; SI-NEXT: v_or_b32_e32 v49, v49, v50 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v49, v53, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v39, v39, v48 @@ -25122,18 +25119,20 @@ define 
inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 @@ -25153,12 +25152,10 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -25179,13 +25176,13 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -25249,13 +25246,13 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_add_f16_sdwa v20, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_add_f16_e32 v20, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v34 +; VI-NEXT: v_add_f16_sdwa v22, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_or_b32_e32 v20, v23, v22 ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 @@ -25318,6 +25315,8 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -25327,10 +25326,8 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -25619,54 +25616,54 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 
-; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; 
SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v26, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v50, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 @@ -25682,157 +25679,157 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; 
SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v26, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v50, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -25843,6 +25840,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 @@ -25864,33 +25862,32 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, 
v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_4 @@ -25906,73 +25903,73 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; 
VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_or_b32_sdwa v0, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_or_b32_sdwa v0, 
v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; VI-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_or_b32_sdwa v4, v4, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; VI-NEXT: v_or_b32_sdwa v6, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v7, v7, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; VI-NEXT: v_or_b32_sdwa v8, v8, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_or_b32_sdwa v10, v10, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v14, v14, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; VI-NEXT: v_or_b32_sdwa v15, v15, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; VI-NEXT: v_or_b32_sdwa v16, v16, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v17, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; VI-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -25980,6 +25977,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -26001,33 +25999,32 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: 
; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB48_2: ; 
%Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_4 @@ -26043,53 +26040,53 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v50, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v49, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v48, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v39, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v38, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v37, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v36, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v35, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v34, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v32, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v31, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v30, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v29, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v28, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v27, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v26, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v25, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v24, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v23, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v52, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v51, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v50, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v49, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v39, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v38, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v37, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v36, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v35, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 +; 
GFX9-NEXT: v_perm_b32 v16, v28, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v27, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v26, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v25, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v24, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v23, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44i16: @@ -26255,169 +26252,169 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v22, s17 -; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_mov_b32_e32 v17, s20 -; SI-NEXT: v_mov_b32_e32 v18, s21 -; SI-NEXT: v_mov_b32_e32 v15, s22 -; SI-NEXT: v_mov_b32_e32 v16, s23 -; SI-NEXT: v_mov_b32_e32 v13, s24 -; SI-NEXT: v_mov_b32_e32 v14, s25 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s27 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v9, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v27, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; SI-NEXT: 
v_alignbit_b32 v34, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v36, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v32, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v34, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v36, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v49, v23, v22, 16 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 
1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_alignbit_b32 v23, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v9, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v27, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v36, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v32, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v34, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v36, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v49, v23, v22, 16 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 ; SI-NEXT: .LBB49_3: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v49 -; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: 
v_or_b32_e32 v22, v22, v49 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v39 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, 
v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v9, v10, 
s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 @@ -26450,7 +26447,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -26483,7 +26480,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_branch .LBB49_2 ; @@ -26492,19 +26489,19 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; VI-NEXT: v_mov_b32_e32 v22, s16 -; VI-NEXT: v_mov_b32_e32 v23, s17 ; VI-NEXT: v_mov_b32_e32 v20, s18 -; VI-NEXT: v_mov_b32_e32 v21, s19 ; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_mov_b32_e32 
v14, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v21, s19 ; VI-NEXT: v_mov_b32_e32 v19, s21 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v16, s26 -; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_mov_b32_e32 v17, s23 +; VI-NEXT: v_mov_b32_e32 v15, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB49_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -26518,12 +26515,12 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 @@ -26537,9 +26534,9 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 
1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -26553,12 +26550,12 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 @@ -26567,40 +26564,40 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v8, v14, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 ; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 ; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; VI-NEXT: v_or_b32_sdwa v22, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; VI-NEXT: v_or_b32_sdwa v23, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_or_b32_sdwa v22, v16, v18 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; VI-NEXT: v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; VI-NEXT: v_or_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v23, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -26629,9 +26626,9 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 @@ -26649,19 +26646,19 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX9-NEXT: v_mov_b32_e32 v22, s16 -; GFX9-NEXT: v_mov_b32_e32 v23, s17 ; GFX9-NEXT: v_mov_b32_e32 v20, s18 -; GFX9-NEXT: v_mov_b32_e32 v21, s19 ; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: 
v_mov_b32_e32 v14, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 ; GFX9-NEXT: v_mov_b32_e32 v19, s21 -; GFX9-NEXT: v_mov_b32_e32 v14, s22 -; GFX9-NEXT: v_mov_b32_e32 v15, s23 -; GFX9-NEXT: v_mov_b32_e32 v9, s24 -; GFX9-NEXT: v_mov_b32_e32 v10, s25 -; GFX9-NEXT: v_mov_b32_e32 v16, s26 -; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s23 +; GFX9-NEXT: v_mov_b32_e32 v15, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -26675,12 +26672,12 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 @@ -26694,9 +26691,9 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], 
v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -26710,12 +26707,12 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 @@ -26723,26 +26720,24 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GFX9-NEXT: .LBB49_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v14 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v16, 
0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 @@ -26758,12 +26753,14 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v9, v49, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v48, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 @@ -26786,9 +26783,9 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; 
implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 @@ -28198,18 +28195,20 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 @@ -28229,12 +28228,10 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -28265,77 +28262,77 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; 
VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, 
v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, 
s6, s18 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 @@ -28429,6 +28426,8 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -28438,10 +28437,8 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -28460,23 +28457,22 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v51, 
16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -28491,6 +28487,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -29147,6 +29144,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 @@ -29168,33 +29166,32 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB52_4 @@ -29210,73 +29207,73 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; VI-NEXT: 
v_lshrrev_b32_e32 v25, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 
v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_or_b32_sdwa v0, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; VI-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_or_b32_sdwa v4, v4, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; VI-NEXT: v_or_b32_sdwa v6, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v7, v7, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; VI-NEXT: v_or_b32_sdwa v8, v8, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_or_b32_sdwa v10, v10, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v14, v14, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; VI-NEXT: v_or_b32_sdwa v15, v15, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; VI-NEXT: v_or_b32_sdwa v16, v16, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v17, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, 
v24 +; VI-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -29284,6 +29281,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -29305,33 +29303,32 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB52_4 @@ -29347,53 +29344,53 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v50, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v49, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v48, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v39, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v38, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v37, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v36, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v35, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v34, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v32, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v31, v12, s4 -; GFX9-NEXT: 
v_perm_b32 v13, v30, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v29, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v28, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v27, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v26, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v25, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v24, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v23, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v52, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v51, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v50, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v49, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v39, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v38, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v37, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v36, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v35, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v32, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v31, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v28, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v27, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v26, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v25, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v24, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v23, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44f16: @@ -29568,37 +29565,38 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: s_and_b64 s[12:13], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 
offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s12, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 ; SI-NEXT: s_lshr_b32 s12, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 ; SI-NEXT: s_lshr_b32 s12, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 ; SI-NEXT: s_lshr_b32 s12, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: s_lshr_b32 s12, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 ; SI-NEXT: s_lshr_b32 s12, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 ; SI-NEXT: s_lshr_b32 s12, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 ; SI-NEXT: s_lshr_b32 s12, s10, 16 
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 ; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: s_lshr_b32 s12, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 ; SI-NEXT: s_lshr_b32 s12, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 ; SI-NEXT: s_lshr_b32 s12, s26, 16 @@ -29608,13 +29606,13 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: s_lshr_b32 s12, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 ; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s12 ; SI-NEXT: s_lshr_b32 s12, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 ; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 ; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s12 ; SI-NEXT: s_lshr_b32 s12, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 ; SI-NEXT: s_lshr_b32 s12, s18, 16 @@ -29622,162 +29620,161 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: s_lshr_b32 s12, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s12 ; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: 
v_cvt_f32_f16_e32 v16, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[33:34], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[29:30], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[25:26], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_add_f64 v[52:53], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[30:31], s[20:21], 1.0 +; SI-NEXT: v_add_f64 
v[27:28], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[9:10], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: 
v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, 
v33 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; 
SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_i32_e32 v53, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 ; SI-NEXT: v_or_b32_e32 v49, v49, v50 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v49, v53, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v39, v39, v48 ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 @@ -29785,133 +29782,133 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 ; 
SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, 
v21, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen 
+; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; 
SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; 
implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -29921,24 +29918,24 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB53_2 ; @@ -29947,19 +29944,19 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; VI-NEXT: v_mov_b32_e32 v22, s16 -; VI-NEXT: 
v_mov_b32_e32 v23, s17 ; VI-NEXT: v_mov_b32_e32 v20, s18 -; VI-NEXT: v_mov_b32_e32 v21, s19 ; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_mov_b32_e32 v14, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v21, s19 ; VI-NEXT: v_mov_b32_e32 v19, s21 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v16, s26 -; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_mov_b32_e32 v17, s23 +; VI-NEXT: v_mov_b32_e32 v15, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB53_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -29973,12 +29970,12 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 @@ -29992,9 +29989,9 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[9:10], 
v[9:10], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -30008,12 +30005,12 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 @@ -30022,40 +30019,40 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v8, v14, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 ; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 ; VI-NEXT: v_or_b32_sdwa v26, v20, 
v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; VI-NEXT: v_or_b32_sdwa v22, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; VI-NEXT: v_or_b32_sdwa v23, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; VI-NEXT: 
v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_or_b32_sdwa v22, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; VI-NEXT: v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; VI-NEXT: v_or_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v23, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30084,9 +30081,9 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 @@ -30104,19 +30101,19 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX9-NEXT: v_mov_b32_e32 v22, s16 -; GFX9-NEXT: 
v_mov_b32_e32 v23, s17 ; GFX9-NEXT: v_mov_b32_e32 v20, s18 -; GFX9-NEXT: v_mov_b32_e32 v21, s19 ; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 ; GFX9-NEXT: v_mov_b32_e32 v19, s21 -; GFX9-NEXT: v_mov_b32_e32 v14, s22 -; GFX9-NEXT: v_mov_b32_e32 v15, s23 -; GFX9-NEXT: v_mov_b32_e32 v9, s24 -; GFX9-NEXT: v_mov_b32_e32 v10, s25 -; GFX9-NEXT: v_mov_b32_e32 v16, s26 -; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s23 +; GFX9-NEXT: v_mov_b32_e32 v15, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -30130,12 +30127,12 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 @@ -30149,9 +30146,9 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], 
v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -30165,12 +30162,12 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 @@ -30178,26 +30175,24 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GFX9-NEXT: .LBB53_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v14 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GFX9-NEXT: v_and_b32_e32 
v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 @@ -30213,12 +30208,14 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v9, v49, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v48, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 @@ -30241,9 +30238,9 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; 
implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 @@ -31957,18 +31954,20 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 @@ -31988,12 +31987,10 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -32014,13 +32011,13 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -32084,13 +32081,13 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; 
VI-NEXT: v_add_f16_sdwa v20, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v34 +; VI-NEXT: v_add_f16_sdwa v22, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_or_b32_e32 v20, v23, v22 ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 @@ -32153,6 +32150,8 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -32162,10 +32161,8 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -33089,15 +33086,15 @@ define <44 x half> 
@bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v52, 3, v52 ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v51, 3, v51 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 @@ -33142,7 +33139,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 ; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33191,57 +33188,57 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX9-LABEL: bitcast_v44i16_to_v44f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v21, v51, v21, s6 -; GFX9-NEXT: v_perm_b32 v20, v50, v20, s6 -; GFX9-NEXT: v_perm_b32 v19, v49, v19, s6 -; GFX9-NEXT: v_perm_b32 v18, v48, v18, s6 -; GFX9-NEXT: v_perm_b32 v17, v39, v17, s6 -; GFX9-NEXT: v_perm_b32 v16, v38, v16, s6 -; GFX9-NEXT: v_perm_b32 v15, v37, 
v15, s6 -; GFX9-NEXT: v_perm_b32 v14, v36, v14, s6 -; GFX9-NEXT: v_perm_b32 v13, v35, v13, s6 -; GFX9-NEXT: v_perm_b32 v12, v34, v12, s6 -; GFX9-NEXT: v_perm_b32 v11, v33, v11, s6 -; GFX9-NEXT: v_perm_b32 v10, v32, v10, s6 -; GFX9-NEXT: v_perm_b32 v9, v31, v9, s6 -; GFX9-NEXT: v_perm_b32 v8, v30, v8, s6 -; GFX9-NEXT: v_perm_b32 v7, v29, v7, s6 -; GFX9-NEXT: v_perm_b32 v6, v28, v6, s6 -; GFX9-NEXT: v_perm_b32 v5, v27, v5, s6 -; GFX9-NEXT: v_perm_b32 v4, v26, v4, s6 -; GFX9-NEXT: v_perm_b32 v3, v25, v3, s6 -; GFX9-NEXT: v_perm_b32 v2, v24, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v23, v1, s6 -; GFX9-NEXT: v_perm_b32 v0, v22, v0, s6 +; GFX9-NEXT: v_perm_b32 v21, v52, v21, s6 +; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 +; GFX9-NEXT: v_perm_b32 v19, v50, v19, s6 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s6 +; GFX9-NEXT: v_perm_b32 v17, v48, v17, s6 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s6 +; GFX9-NEXT: v_perm_b32 v15, v38, v15, s6 +; GFX9-NEXT: v_perm_b32 v14, v37, v14, s6 +; GFX9-NEXT: v_perm_b32 v13, v36, v13, s6 +; GFX9-NEXT: v_perm_b32 v12, v35, v12, s6 +; GFX9-NEXT: v_perm_b32 v11, v34, v11, s6 +; GFX9-NEXT: v_perm_b32 v10, v33, v10, s6 +; GFX9-NEXT: v_perm_b32 v9, v32, v9, s6 +; GFX9-NEXT: v_perm_b32 v8, v31, v8, s6 +; GFX9-NEXT: v_perm_b32 v7, v30, v7, s6 +; GFX9-NEXT: v_perm_b32 v6, v29, v6, s6 +; GFX9-NEXT: v_perm_b32 v5, v28, v5, s6 +; GFX9-NEXT: v_perm_b32 v4, v27, v4, s6 +; GFX9-NEXT: v_perm_b32 v3, v26, v3, s6 +; GFX9-NEXT: v_perm_b32 v2, v25, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v24, v1, s6 +; GFX9-NEXT: v_perm_b32 v0, v23, v0, s6 ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] @@ -33264,53 +33261,53 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: 
v_lshrrev_b32_e32 v22, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 ; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v22, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v23, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v24, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v25, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v26, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v27, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v28, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v30, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v31, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v32, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v34, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v35, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v36, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v37, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v38, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v39, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v48, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v49, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v50, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v51, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v23, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v24, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v25, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v26, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v27, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v28, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v30, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v31, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v32, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v34, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v35, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v36, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v37, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v38, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v48, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v50, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v51, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v52, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v44f16: @@ 
-33515,72 +33512,72 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s21 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s27 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded 
Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 @@ -33604,7 +33601,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 @@ -33656,23 +33653,23 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -33681,23 +33678,23 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; 
SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -33706,30 +33703,30 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -33759,7 +33756,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -33777,7 +33774,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -33809,7 +33806,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -33834,36 +33831,36 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -33887,61 +33884,61 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; 
implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: 
s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v44i16_to_v44f16_scalar: @@ -34662,427 +34659,444 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; SI-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 
v13, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v30 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v44 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v46 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v57 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v58 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v62 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_cvt_f16_f32_e32 v4, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v26 -; 
SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v9, v9, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v13, v13, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v20, v20, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v6, v6, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_or_b32_e32 v12, v12, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v31 -; SI-NEXT: 
v_cvt_f32_f16_e32 v28, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v31, v26, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_or_b32_e32 v10, v10, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_or_b32_e32 v17, v17, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v31 +; SI-NEXT: v_or_b32_e32 v32, v27, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_or_b32_e32 v34, v26, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v27 +; SI-NEXT: v_or_b32_e32 v35, v26, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v41 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_add_f32_e32 
v26, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v37, v26, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 +; SI-NEXT: v_or_b32_e32 v38, v26, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v39 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v48, v26, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_or_b32_e32 v49, v26, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v52, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: 
v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v51, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v55 -; SI-NEXT: v_or_b32_e32 v50, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v39, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v50, v27, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v55 +; SI-NEXT: v_or_b32_e32 v39, v33, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, 
v22 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v36 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v36, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, 
v30 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v33, v33, v30 -; SI-NEXT: v_or_b32_e32 v21, v21, v52 -; SI-NEXT: v_or_b32_e32 v17, v17, v25 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v23 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_or_b32_e32 v11, v11, v18 -; SI-NEXT: v_alignbit_b32 v41, v48, v26, 16 -; SI-NEXT: v_alignbit_b32 v40, v37, v27, 16 -; SI-NEXT: v_alignbit_b32 v55, v34, v28, 16 -; SI-NEXT: v_alignbit_b32 v54, v31, v29, 16 -; SI-NEXT: v_alignbit_b32 v53, v19, v30, 16 -; SI-NEXT: v_alignbit_b32 v52, v15, v52, 16 -; SI-NEXT: v_alignbit_b32 v25, v12, v25, 16 -; SI-NEXT: v_alignbit_b32 v24, v6, v24, 16 -; SI-NEXT: v_alignbit_b32 v23, v1, v23, 16 -; SI-NEXT: v_alignbit_b32 v22, v9, v22, 16 -; SI-NEXT: v_alignbit_b32 v18, v3, v18, 16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v9, v3, v28, 16 +; SI-NEXT: v_or_b32_e32 v36, v36, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v21, v21, v53 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v22 +; SI-NEXT: v_or_b32_e32 v25, v25, v15 +; SI-NEXT: v_alignbit_b32 v42, v52, v26, 16 +; SI-NEXT: v_alignbit_b32 v41, v49, v29, 16 +; SI-NEXT: v_alignbit_b32 v40, v38, v27, 16 +; SI-NEXT: v_alignbit_b32 v55, v35, v30, 16 +; SI-NEXT: v_alignbit_b32 v54, v32, v33, 16 +; SI-NEXT: v_alignbit_b32 v53, v20, v53, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v24, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v23, 16 +; SI-NEXT: v_alignbit_b32 v22, v10, v22, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v15, 16 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v51 -; SI-NEXT: 
v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v42 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v48 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v55 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, 
vcc, 24, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v54 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v6, v26, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: 
v_or_b32_e32 v6, v6, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: 
buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35110,15 +35124,15 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v52, 0x200, v52 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_add_f16_e32 v51, 0x200, v51 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 @@ -35163,7 +35177,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v23, 
0x200, v23 ; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 ; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -35212,58 +35226,58 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX9-LABEL: bitcast_v44f16_to_v44i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 
16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v21, v51, v21, s6 +; GFX9-NEXT: v_perm_b32 v21, v52, v21, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v20, v50, v20, s6 -; GFX9-NEXT: v_perm_b32 v19, v49, v19, s6 -; GFX9-NEXT: v_perm_b32 v18, v48, v18, s6 -; GFX9-NEXT: v_perm_b32 v17, v39, v17, s6 -; GFX9-NEXT: v_perm_b32 v16, v38, v16, s6 -; GFX9-NEXT: v_perm_b32 v15, v37, v15, s6 -; GFX9-NEXT: v_perm_b32 v14, v36, v14, s6 -; GFX9-NEXT: v_perm_b32 v13, v35, v13, s6 -; GFX9-NEXT: v_perm_b32 v12, v34, v12, s6 -; GFX9-NEXT: v_perm_b32 v11, v33, v11, s6 -; GFX9-NEXT: v_perm_b32 v10, v32, v10, s6 -; GFX9-NEXT: v_perm_b32 v9, v31, v9, s6 -; GFX9-NEXT: v_perm_b32 v8, v30, v8, s6 -; GFX9-NEXT: v_perm_b32 v7, v29, v7, s6 -; GFX9-NEXT: v_perm_b32 v6, v28, v6, s6 -; GFX9-NEXT: v_perm_b32 v5, v27, v5, s6 -; GFX9-NEXT: v_perm_b32 v4, v26, v4, s6 -; GFX9-NEXT: v_perm_b32 v3, v25, v3, s6 -; GFX9-NEXT: v_perm_b32 v2, v24, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v23, v1, s6 -; GFX9-NEXT: v_perm_b32 v0, v22, v0, s6 +; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 +; GFX9-NEXT: v_perm_b32 v19, v50, v19, s6 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, 
s6 +; GFX9-NEXT: v_perm_b32 v17, v48, v17, s6 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s6 +; GFX9-NEXT: v_perm_b32 v15, v38, v15, s6 +; GFX9-NEXT: v_perm_b32 v14, v37, v14, s6 +; GFX9-NEXT: v_perm_b32 v13, v36, v13, s6 +; GFX9-NEXT: v_perm_b32 v12, v35, v12, s6 +; GFX9-NEXT: v_perm_b32 v11, v34, v11, s6 +; GFX9-NEXT: v_perm_b32 v10, v33, v10, s6 +; GFX9-NEXT: v_perm_b32 v9, v32, v9, s6 +; GFX9-NEXT: v_perm_b32 v8, v31, v8, s6 +; GFX9-NEXT: v_perm_b32 v7, v30, v7, s6 +; GFX9-NEXT: v_perm_b32 v6, v29, v6, s6 +; GFX9-NEXT: v_perm_b32 v5, v28, v5, s6 +; GFX9-NEXT: v_perm_b32 v4, v27, v4, s6 +; GFX9-NEXT: v_perm_b32 v3, v26, v3, s6 +; GFX9-NEXT: v_perm_b32 v2, v25, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v24, v1, s6 +; GFX9-NEXT: v_perm_b32 v0, v23, v0, s6 ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] @@ -35286,53 +35300,53 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v48, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 ; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v22, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v23, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v24, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v25, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v26, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v27, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v28, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v29, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v30, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v31, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v32, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v33, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v34, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v35, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v36, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v37, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v38, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v39, 
v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v48, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v49, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v50, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v51, v21, s4 +; GFX9-NEXT: v_perm_b32 v0, v23, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v24, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v25, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v26, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v27, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v28, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v29, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v30, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v31, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v32, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v33, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v34, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v35, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v36, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v37, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v38, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v48, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v50, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v51, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v52, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v44i16: @@ -35515,368 +35529,399 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: 
v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; 
%bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v13, v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; 
SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v3, v3, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v6, v6, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; SI-NEXT: v_or_b32_e32 v10, v10, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v31 +; SI-NEXT: v_or_b32_e32 v32, v18, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v9, v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_or_b32_e32 v35, v17, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v37 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 +; SI-NEXT: v_or_b32_e32 v38, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; 
SI-NEXT: v_or_b32_e32 v49, v17, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v52, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 +; SI-NEXT: v_or_b32_e32 v40, v17, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v50 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_or_b32_e32 v13, v13, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v44, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v42, v18, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 ; SI-NEXT: 
v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_or_b32_e32 v31, v31, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_or_b32_e32 v41, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_or_b32_e32 v54, v22, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v50, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v34, v34, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v38 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v39, v23, v22 +; SI-NEXT: 
v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 +; SI-NEXT: v_or_b32_e32 v36, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v37, v37, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v18, v18, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v22 -; SI-NEXT: 
v_or_b32_e32 v21, v21, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 -; SI-NEXT: v_or_b32_e32 v24, v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v27, v27, v52 -; SI-NEXT: v_or_b32_e32 v26, v26, v30 -; SI-NEXT: v_or_b32_e32 v23, v23, v29 -; SI-NEXT: v_or_b32_e32 v20, v20, v28 -; SI-NEXT: v_or_b32_e32 v39, v39, v51 -; SI-NEXT: v_or_b32_e32 v36, v36, v50 -; SI-NEXT: v_or_b32_e32 v33, v33, v49 -; SI-NEXT: v_or_b32_e32 v15, v15, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v17 -; SI-NEXT: v_or_b32_e32 v8, v8, v16 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 -; SI-NEXT: v_alignbit_b32 v52, v24, v52, 16 -; SI-NEXT: v_alignbit_b32 v30, v21, v30, 16 -; SI-NEXT: v_alignbit_b32 v29, v18, v29, 16 -; SI-NEXT: v_alignbit_b32 v28, v37, v28, 16 -; SI-NEXT: v_alignbit_b32 v51, v34, v51, 16 -; SI-NEXT: v_alignbit_b32 v50, v31, v50, 16 -; SI-NEXT: v_alignbit_b32 v49, v13, v49, 16 -; SI-NEXT: v_alignbit_b32 v48, v9, v48, 16 -; SI-NEXT: v_alignbit_b32 v17, v6, v17, 16 -; SI-NEXT: v_alignbit_b32 v16, v3, v16, 16 -; SI-NEXT: v_alignbit_b32 v12, v1, v12, 16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v9, v3, v19, 16 +; 
SI-NEXT: v_or_b32_e32 v33, v26, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_alignbit_b32 v58, v44, v17, 16 +; SI-NEXT: v_alignbit_b32 v57, v40, v20, 16 +; SI-NEXT: v_alignbit_b32 v56, v52, v18, 16 +; SI-NEXT: v_alignbit_b32 v47, v49, v21, 16 +; SI-NEXT: v_alignbit_b32 v46, v38, v22, 16 +; SI-NEXT: v_alignbit_b32 v45, v35, v23, 16 +; SI-NEXT: v_alignbit_b32 v43, v32, v24, 16 +; SI-NEXT: v_alignbit_b32 v53, v13, v25, 16 +; SI-NEXT: v_alignbit_b32 v16, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v15, 16 +; SI-NEXT: v_or_b32_e32 v7, v7, v19 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v27, v27, v52 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v58 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 
v22, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v57 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 -; 
SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v47 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 32, v0 +; SI-NEXT: 
buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_add_i32_e32 v17, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v11, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v4, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 
152a48bec2636..20b41ec0169b7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -176,7 +176,8 @@ define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, ; SI-LABEL: bitcast_v24i32_to_v24f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -197,8 +198,8 @@ define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -238,7 +239,8 @@ define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, ; VI-LABEL: bitcast_v24i32_to_v24f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -259,8 +261,8 @@ define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -300,7 +302,8 @@ define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, ; GFX9-LABEL: bitcast_v24i32_to_v24f32_scalar: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -321,8 +324,8 @@ define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -587,7 +590,8 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, ; SI-LABEL: bitcast_v24f32_to_v24i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -608,8 +612,8 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -649,7 +653,8 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, ; VI-LABEL: bitcast_v24f32_to_v24i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -670,8 +675,8 
@@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -711,7 +716,8 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, ; GFX9-LABEL: bitcast_v24f32_to_v24i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -732,8 +738,8 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -998,7 +1004,8 @@ define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v24i32_to_v12i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -1019,8 +1026,8 @@ define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: 
s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -1060,7 +1067,8 @@ define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i3 ; VI-LABEL: bitcast_v24i32_to_v12i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -1081,8 +1089,8 @@ define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -1122,7 +1130,8 @@ define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i3 ; GFX9-LABEL: bitcast_v24i32_to_v12i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -1143,8 +1152,8 @@ define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -1427,7 +1436,8 @@ define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i3 ; SI-LABEL: 
bitcast_v12i64_to_v24i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -1448,8 +1458,8 @@ define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -1489,7 +1499,8 @@ define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i3 ; VI-LABEL: bitcast_v12i64_to_v24i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -1510,8 +1521,8 @@ define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -1551,7 +1562,8 @@ define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i3 ; GFX9-LABEL: bitcast_v12i64_to_v24i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; 
GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -1572,8 +1584,8 @@ define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -1856,7 +1868,8 @@ define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, ; SI-LABEL: bitcast_v24i32_to_v12f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -1877,8 +1890,8 @@ define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -1918,7 +1931,8 @@ define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, ; VI-LABEL: bitcast_v24i32_to_v12f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -1939,8 +1953,8 @@ define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: 
v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -1980,7 +1994,8 @@ define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, ; GFX9-LABEL: bitcast_v24i32_to_v12f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -2001,8 +2016,8 @@ define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -2231,7 +2246,8 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; SI-LABEL: bitcast_v12f64_to_v24i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -2243,16 +2259,16 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: 
v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 @@ -2281,7 +2297,8 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; VI-LABEL: bitcast_v12f64_to_v24i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -2293,16 +2310,16 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 @@ -2331,7 +2348,8 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; GFX9-LABEL: bitcast_v12f64_to_v24i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ 
-2343,16 +2361,16 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 @@ -5449,51 +5467,51 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; 
VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5655,7 +5673,6 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v39, v2 ; GFX9-NEXT: v_mov_b32_e32 v48, v1 ; GFX9-NEXT: v_mov_b32_e32 v49, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_lshr_b32 s40, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 @@ -5670,6 +5687,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -7058,7 +7076,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 @@ -7099,7 +7117,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 @@ -7173,7 +7191,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, 
s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 @@ -7199,7 +7217,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s74 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s72 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 @@ -7344,24 +7362,24 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 @@ -7428,10 +7446,10 @@ define inreg <48 x half> 
@bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -9723,51 +9741,51 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 
0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; 
VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -9791,64 +9809,64 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; VI-NEXT: v_add_f16_e32 v1, s16, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_mov_b32_e32 v1, s42 -; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s41 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s41 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s18, v13 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_e32 v4, s18, v13 +; VI-NEXT: v_add_f16_e32 v5, s19, v13 ; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, s19, v13 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: v_mov_b32_e32 v4, s15 -; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, s20, v13 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_e32 v3, v5, v3 ; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_e32 v6, s20, v13 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s21, v13 ; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, s21, v13 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_or_b32_e32 v4, v6, v4 ; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, s22, v13 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_or_b32_e32 v5, v7, v5 ; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_e32 v8, s22, v13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s23, v13 ; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, s23, v13 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 ; VI-NEXT: v_mov_b32_e32 v8, s11 -; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, s24, v13 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_or_b32_e32 v7, v9, v7 ; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_e32 v10, s24, v13 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s25, v13 ; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, s25, v13 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_or_b32_e32 v8, v10, v8 ; VI-NEXT: v_mov_b32_e32 v10, s9 -; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, s26, v13 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_or_b32_e32 v9, v11, v9 ; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_e32 v12, s26, v13 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s27, v13 ; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, s27, v13 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_or_b32_e32 v10, v12, v10 ; VI-NEXT: v_mov_b32_e32 v12, s7 -; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, s28, v13 -; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_or_b32_e32 v11, v14, v11 ; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_e32 v15, s28, v13 +; 
VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v13, s29, v13 -; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v12, v15, v12 +; VI-NEXT: v_add_f16_sdwa v15, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v16, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_or_b32_e32 v15, v17, v16 ; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v16, v17, v16 @@ -9892,7 +9910,6 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v39, v2 ; GFX9-NEXT: v_mov_b32_e32 v48, v1 ; GFX9-NEXT: v_mov_b32_e32 v49, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_lshr_b32 s40, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 @@ -9907,6 +9924,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: buffer_store_dword 
v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -10431,7 +10449,8 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, ; SI-LABEL: bitcast_v24f32_to_v12i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -10452,8 +10471,8 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -10493,7 +10512,8 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, ; VI-LABEL: bitcast_v24f32_to_v12i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -10514,8 +10534,8 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -10555,7 +10575,8 @@ define inreg <12 x i64> 
@bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, ; GFX9-LABEL: bitcast_v24f32_to_v12i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -10576,8 +10597,8 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -10848,7 +10869,8 @@ define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, ; SI-LABEL: bitcast_v12i64_to_v24f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -10869,8 +10891,8 @@ define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -10910,7 +10932,8 @@ define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, ; VI-LABEL: bitcast_v12i64_to_v24f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -10931,8 +10954,8 @@ define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -10972,7 +10995,8 @@ define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, ; GFX9-LABEL: bitcast_v12i64_to_v24f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -10993,8 +11017,8 @@ define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -11265,7 +11289,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg % ; SI-LABEL: bitcast_v24f32_to_v12f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -11286,8 +11311,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v7, 
s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -11327,7 +11352,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg % ; VI-LABEL: bitcast_v24f32_to_v12f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -11348,8 +11374,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -11389,7 +11415,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg % ; GFX9-LABEL: bitcast_v24f32_to_v12f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -11410,8 +11437,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; 
GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -11628,7 +11655,8 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; SI-LABEL: bitcast_v12f64_to_v24f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -11640,16 +11668,16 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 @@ -11678,7 +11706,8 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; VI-LABEL: bitcast_v12f64_to_v24f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -11690,16 +11719,16 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; 
VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 @@ -11728,7 +11757,8 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; GFX9-LABEL: bitcast_v12f64_to_v24f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -11740,16 +11770,16 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 @@ -12564,17 +12594,17 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v26, s16 -; SI-NEXT: v_mov_b32_e32 v24, s17 +; SI-NEXT: v_mov_b32_e32 v25, s17 ; SI-NEXT: v_mov_b32_e32 v23, s18 ; SI-NEXT: v_mov_b32_e32 v22, s19 ; SI-NEXT: v_mov_b32_e32 v20, s20 ; SI-NEXT: v_mov_b32_e32 v19, s21 ; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v14, s25 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v16, s25 -; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s26 ; SI-NEXT: v_mov_b32_e32 v13, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v11, s29 @@ -12582,17 +12612,17 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v25, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v31, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v34, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v36, v16, v17, 16 -; SI-NEXT: v_alignbit_b32 v38, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v34, v13, v15, 16 +; SI-NEXT: v_alignbit_b32 v36, v14, v16, 16 +; SI-NEXT: v_alignbit_b32 v38, v17, v18, 16 ; SI-NEXT: v_alignbit_b32 v48, v19, v20, 16 ; SI-NEXT: v_alignbit_b32 v51, v22, v23, 16 -; SI-NEXT: v_alignbit_b32 v53, v24, v26, 16 +; SI-NEXT: v_alignbit_b32 v53, v25, v26, 16 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 @@ -12600,26 +12630,26 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 ; SI-NEXT: 
v_lshrrev_b32_e32 v49, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 @@ -12633,17 +12663,17 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v25, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v31, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v34, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v36, v16, v17, 16 -; SI-NEXT: v_alignbit_b32 v38, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v34, v13, v15, 16 +; SI-NEXT: v_alignbit_b32 v36, v14, v16, 16 +; SI-NEXT: v_alignbit_b32 v38, v17, v18, 16 ; SI-NEXT: v_alignbit_b32 
v48, v19, v20, 16 ; SI-NEXT: v_alignbit_b32 v51, v22, v23, 16 -; SI-NEXT: v_alignbit_b32 v53, v24, v26, 16 +; SI-NEXT: v_alignbit_b32 v53, v25, v26, 16 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 @@ -12651,29 +12681,29 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 ; SI-NEXT: .LBB29_3: ; %end ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 ; SI-NEXT: v_or_b32_e32 v26, v26, v53 ; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_lshlrev_b32_e32 v23, 16, v55 @@ -12698,26 +12728,26 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v18, v18, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 @@ -12778,7 +12808,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, 
v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -12824,7 +12854,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -12834,18 +12864,18 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: v_mov_b32_e32 v23, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v21, s18 -; VI-NEXT: v_mov_b32_e32 v19, s19 -; VI-NEXT: v_mov_b32_e32 v17, s20 -; VI-NEXT: v_mov_b32_e32 v15, s21 -; VI-NEXT: v_mov_b32_e32 v13, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v11, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v22, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v20, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v11, s26 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v20, s26 ; VI-NEXT: v_mov_b32_e32 v18, s27 ; VI-NEXT: v_mov_b32_e32 v16, s28 ; VI-NEXT: v_mov_b32_e32 v14, s29 @@ -12864,17 +12894,17 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 
16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; VI-NEXT: s_cbranch_execnz .LBB29_3 ; VI-NEXT: .LBB29_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -12890,17 +12920,17 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 @@ -12914,59 +12944,59 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: 
v_lshrrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; VI-NEXT: .LBB29_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v24 -; VI-NEXT: v_or_b32_sdwa v24, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 
v13, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v24, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; VI-NEXT: v_or_b32_sdwa v28, v11, v12 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; VI-NEXT: v_or_b32_sdwa v29, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 -; VI-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -12990,13 +13020,13 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 
-; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 @@ -13016,18 +13046,18 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_mov_b32_e32 v23, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v21, s18 -; GFX9-NEXT: v_mov_b32_e32 v19, s19 -; GFX9-NEXT: v_mov_b32_e32 v17, s20 -; GFX9-NEXT: v_mov_b32_e32 v15, s21 -; GFX9-NEXT: v_mov_b32_e32 v13, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v11, s24 -; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v22, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v20, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v12, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v20, s26 ; GFX9-NEXT: v_mov_b32_e32 v18, s27 ; GFX9-NEXT: v_mov_b32_e32 v16, s28 ; GFX9-NEXT: v_mov_b32_e32 v14, s29 @@ -13046,17 +13076,17 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v27, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; GFX9-NEXT: s_cbranch_execnz .LBB29_3 ; GFX9-NEXT: .LBB29_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -13072,17 +13102,17 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 @@ -13096,62 +13126,62 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; GFX9-NEXT: .LBB29_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v12 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; 
GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v24 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v22 ; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v55, 16, v10 ; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 @@ -13172,13 +13202,13 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: 
; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 @@ -14791,51 +14821,51 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v17, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -14997,7 +15027,6 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v39, v2 ; GFX9-NEXT: v_mov_b32_e32 v48, v1 ; GFX9-NEXT: v_mov_b32_e32 v49, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_lshr_b32 s40, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 @@ -15012,6 +15041,7 @@ define inreg <24 
x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -16388,7 +16418,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 @@ -16428,7 +16458,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 @@ -16456,7 +16486,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 ; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 @@ -16471,7 +16501,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 ; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 ; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 ; SI-NEXT: v_add_f32_e64 
v8, s7, 1.0 ; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 ; SI-NEXT: v_add_f32_e64 v4, s9, 1.0 @@ -16480,7 +16510,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 @@ -16499,7 +16529,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: s_waitcnt expcnt(1) @@ -16509,7 +16539,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -16524,7 +16554,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v3 @@ -16535,7 +16565,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 +; SI-NEXT: 
v_cvt_f32_f16_e32 v9, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 @@ -16678,24 +16708,24 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 @@ -16774,10 +16804,10 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 ; 
SI-NEXT: ; implicit-def: $vgpr3 @@ -16791,18 +16821,18 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: v_mov_b32_e32 v23, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v21, s18 -; VI-NEXT: v_mov_b32_e32 v19, s19 -; VI-NEXT: v_mov_b32_e32 v17, s20 -; VI-NEXT: v_mov_b32_e32 v15, s21 -; VI-NEXT: v_mov_b32_e32 v13, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v11, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v22, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v20, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v11, s26 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v20, s26 ; VI-NEXT: v_mov_b32_e32 v18, s27 ; VI-NEXT: v_mov_b32_e32 v16, s28 ; VI-NEXT: v_mov_b32_e32 v14, s29 @@ -16821,17 +16851,17 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; 
VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; VI-NEXT: s_cbranch_execnz .LBB33_3 ; VI-NEXT: .LBB33_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -16847,17 +16877,17 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 @@ -16871,59 +16901,59 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 
v25, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; VI-NEXT: .LBB33_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v24 -; VI-NEXT: v_or_b32_sdwa v24, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v24, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v22, v23 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; VI-NEXT: v_or_b32_sdwa v28, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, 
v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; VI-NEXT: v_or_b32_sdwa v29, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 -; VI-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -16947,13 +16977,13 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr10 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 @@ -16973,18 +17003,18 @@ define inreg <48 x half> 
@bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_mov_b32_e32 v23, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v21, s18 -; GFX9-NEXT: v_mov_b32_e32 v19, s19 -; GFX9-NEXT: v_mov_b32_e32 v17, s20 -; GFX9-NEXT: v_mov_b32_e32 v15, s21 -; GFX9-NEXT: v_mov_b32_e32 v13, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v11, s24 -; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v22, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v20, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v12, s25 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v20, s26 ; GFX9-NEXT: v_mov_b32_e32 v18, s27 ; GFX9-NEXT: v_mov_b32_e32 v16, s28 ; GFX9-NEXT: v_mov_b32_e32 v14, s29 @@ -17003,17 +17033,17 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; GFX9-NEXT: s_cbranch_execnz .LBB33_3 ; GFX9-NEXT: .LBB33_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -17029,17 +17059,17 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 @@ -17053,62 +17083,62 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 
v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; GFX9-NEXT: .LBB33_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v12 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, 
v24 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v24 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v22 ; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v55, 16, v10 ; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 @@ -17129,13 +17159,13 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr10 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 @@ -19055,51 +19085,51 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; VI-NEXT: 
s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v36, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -19123,64 +19153,64 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; VI-NEXT: v_add_f16_e32 v1, s16, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_mov_b32_e32 v1, s42 -; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s41 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s41 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s18, v13 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v3, 
s40 +; VI-NEXT: v_add_f16_e32 v4, s18, v13 +; VI-NEXT: v_add_f16_e32 v5, s19, v13 ; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, s19, v13 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: v_mov_b32_e32 v4, s15 -; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, s20, v13 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_e32 v3, v5, v3 ; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_e32 v6, s20, v13 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s21, v13 ; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, s21, v13 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_or_b32_e32 v4, v6, v4 ; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, s22, v13 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_or_b32_e32 v5, v7, v5 ; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_e32 v8, s22, v13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s23, v13 ; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, s23, v13 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 ; VI-NEXT: v_mov_b32_e32 v8, s11 -; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, s24, v13 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_or_b32_e32 v7, v9, v7 ; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_e32 v10, s24, v13 +; VI-NEXT: 
v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s25, v13 ; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, s25, v13 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_or_b32_e32 v8, v10, v8 ; VI-NEXT: v_mov_b32_e32 v10, s9 -; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, s26, v13 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_or_b32_e32 v9, v11, v9 ; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_e32 v12, s26, v13 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s27, v13 ; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, s27, v13 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_or_b32_e32 v10, v12, v10 ; VI-NEXT: v_mov_b32_e32 v12, s7 -; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, s28, v13 -; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_or_b32_e32 v11, v14, v11 ; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_e32 v15, s28, v13 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v13, s29, v13 -; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v12, v15, v12 +; VI-NEXT: v_add_f16_sdwa v15, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v16, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 
v13, v13, v14 -; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_or_b32_e32 v15, v17, v16 ; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v16, v17, v16 @@ -19224,7 +19254,6 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v39, v2 ; GFX9-NEXT: v_mov_b32_e32 v48, v1 ; GFX9-NEXT: v_mov_b32_e32 v49, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_lshr_b32 s40, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 @@ -19239,6 +19268,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -19781,7 +19811,8 @@ define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, ; SI-LABEL: bitcast_v12i64_to_v12f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: 
v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -19802,8 +19833,8 @@ define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 @@ -19843,7 +19874,8 @@ define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, ; VI-LABEL: bitcast_v12i64_to_v12f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -19864,8 +19896,8 @@ define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 @@ -19905,7 +19937,8 @@ define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, ; GFX9-LABEL: bitcast_v12i64_to_v12f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -19926,8 +19959,8 @@ define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; 
GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 @@ -20162,7 +20195,8 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, ; SI-LABEL: bitcast_v12f64_to_v12i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 ; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v21, v7 @@ -20174,16 +20208,16 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 @@ -20212,7 +20246,8 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, ; VI-LABEL: bitcast_v12f64_to_v12i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 @@ -20224,16 +20259,16 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x 
double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 @@ -20262,7 +20297,8 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, ; GFX9-LABEL: bitcast_v12f64_to_v12i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 ; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 @@ -20274,16 +20310,16 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, 
exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 @@ -23392,51 +23428,51 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 
0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -23598,7 +23634,6 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v39, v2 ; GFX9-NEXT: v_mov_b32_e32 v48, v1 ; GFX9-NEXT: v_mov_b32_e32 v49, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_lshr_b32 s40, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 @@ -23613,6 +23648,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -25013,7 +25049,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 @@ -25054,7 +25090,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 @@ -25128,7 +25164,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 @@ -25154,7 +25190,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s74 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s72 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 @@ -25299,24 +25335,24 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: 
v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 @@ -25383,10 +25419,10 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -27678,51 +27714,51 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 
16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, 
v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -27746,64 +27782,64 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; VI-NEXT: v_add_f16_e32 v1, s16, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_mov_b32_e32 v1, s42 -; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s41 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s41 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s18, v13 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_e32 v4, s18, v13 +; VI-NEXT: v_add_f16_e32 v5, s19, v13 ; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, s19, v13 -; 
VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: v_mov_b32_e32 v4, s15 -; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, s20, v13 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_e32 v3, v5, v3 ; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_e32 v6, s20, v13 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s21, v13 ; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, s21, v13 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_or_b32_e32 v4, v6, v4 ; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, s22, v13 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_or_b32_e32 v5, v7, v5 ; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_e32 v8, s22, v13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s23, v13 ; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, s23, v13 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 ; VI-NEXT: v_mov_b32_e32 v8, s11 -; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, s24, v13 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_or_b32_e32 v7, v9, v7 ; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_e32 v10, s24, v13 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s25, v13 ; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, s25, v13 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_or_b32_e32 v8, v10, v8 ; VI-NEXT: v_mov_b32_e32 v10, s9 -; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, s26, v13 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_or_b32_e32 v9, v11, v9 ; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_e32 v12, s26, v13 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s27, v13 ; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, s27, v13 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_or_b32_e32 v10, v12, v10 ; VI-NEXT: v_mov_b32_e32 v12, s7 -; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, s28, v13 -; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_or_b32_e32 v11, v14, v11 ; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_e32 v15, s28, v13 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v13, s29, v13 -; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v12, v15, v12 +; VI-NEXT: v_add_f16_sdwa v15, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v16, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: 
v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_or_b32_e32 v15, v17, v16 ; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v16, v17, v16 @@ -27847,7 +27883,6 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v39, v2 ; GFX9-NEXT: v_mov_b32_e32 v48, v1 ; GFX9-NEXT: v_mov_b32_e32 v49, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_lshr_b32 s40, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 @@ -27862,6 +27897,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -28609,8 +28645,8 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v8, v8, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v35 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28734,8 +28770,8 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v8, v39, v8, s4 ; GFX9-NEXT: v_perm_b32 v9, v38, v9, s4 ; GFX9-NEXT: v_perm_b32 v10, v37, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v36, v11, s4 ; GFX9-NEXT: v_perm_b32 v12, v35, v12, s4 +; GFX9-NEXT: v_perm_b32 v11, v36, v11, s4 ; GFX9-NEXT: v_perm_b32 v13, v34, v13, s4 ; GFX9-NEXT: v_perm_b32 v14, v33, v14, s4 ; GFX9-NEXT: v_perm_b32 v15, v32, v15, s4 @@ -28923,17 +28959,17 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, s16 -; SI-NEXT: v_mov_b32_e32 v24, s17 ; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 ; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v24, s17 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_mov_b32_e32 v18, s23 -; SI-NEXT: v_mov_b32_e32 v15, s24 -; SI-NEXT: v_mov_b32_e32 v16, s25 -; SI-NEXT: v_mov_b32_e32 v13, s26 -; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s28 ; SI-NEXT: v_mov_b32_e32 v12, s29 @@ -28946,8 +28982,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; 
SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v35, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 ; SI-NEXT: v_alignbit_b32 v50, v22, v21, 16 @@ -28958,8 +28994,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 @@ -28971,8 +29007,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 @@ -28985,8 +29021,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v35, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 ; SI-NEXT: v_alignbit_b32 
v50, v22, v21, 16 @@ -28997,8 +29033,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 @@ -29051,26 +29087,26 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, 
v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 @@ -29182,20 +29218,20 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v21, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v11, s26 ; VI-NEXT: v_mov_b32_e32 v24, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v16, s21 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v21, s24 -; VI-NEXT: v_mov_b32_e32 v22, s25 -; VI-NEXT: v_mov_b32_e32 v17, s26 -; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v22, s19 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v14, s25 +; VI-NEXT: v_mov_b32_e32 v12, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: s_cbranch_scc0 .LBB49_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -29208,18 +29244,18 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; 
VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; VI-NEXT: s_cbranch_execnz .LBB49_3 @@ -29229,12 +29265,12 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 @@ -29246,18 +29282,18 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: 
v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; VI-NEXT: .LBB49_3: ; %end @@ -29266,42 +29302,42 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 ; VI-NEXT: v_or_b32_sdwa v31, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; VI-NEXT: v_or_b32_sdwa v24, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; VI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v28 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 ; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v12, v19, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; VI-NEXT: v_or_b32_sdwa v21, 
v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -29352,20 +29388,20 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: v_mov_b32_e32 v24, s17 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v16, s21 -; GFX9-NEXT: v_mov_b32_e32 v11, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v21, s24 -; GFX9-NEXT: v_mov_b32_e32 v22, s25 -; GFX9-NEXT: v_mov_b32_e32 v17, s26 -; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v22, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v14, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v13, s28 -; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -29378,18 +29414,18 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 @@ -29399,12 +29435,12 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 @@ -29416,60 +29452,60 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 
v54, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: .LBB49_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: 
v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 @@ -31114,51 +31150,51 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x 
i16> inreg %a, ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa 
v19, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -31320,7 +31356,6 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v39, v2 ; GFX9-NEXT: v_mov_b32_e32 v48, v1 ; GFX9-NEXT: v_mov_b32_e32 v49, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_lshr_b32 s40, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 @@ -31335,6 +31370,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 
4-byte Folded Spill @@ -32313,8 +32349,8 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v8, v8, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -32438,8 +32474,8 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v8, v39, v8, s4 ; GFX9-NEXT: v_perm_b32 v9, v38, v9, s4 ; GFX9-NEXT: v_perm_b32 v10, v37, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v36, v11, s4 ; GFX9-NEXT: v_perm_b32 v12, v35, v12, s4 +; GFX9-NEXT: v_perm_b32 v11, v36, v11, s4 ; GFX9-NEXT: v_perm_b32 v13, v34, v13, s4 ; GFX9-NEXT: v_perm_b32 v14, v33, v14, s4 ; GFX9-NEXT: v_perm_b32 v15, v32, v15, s4 @@ -32664,7 +32700,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: s_lshr_b32 s14, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 ; SI-NEXT: s_lshr_b32 s14, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 ; SI-NEXT: s_lshr_b32 s14, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 ; SI-NEXT: s_lshr_b32 s14, s10, 16 @@ -32704,7 +32740,7 @@ define inreg <48 x half> 
@bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 @@ -32736,7 +32772,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_add_f64 v[22:23], s[28:29], 1.0 ; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[8:9], 1.0 ; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 @@ -32758,11 +32794,11 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 @@ -32773,9 +32809,9 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 ; SI-NEXT: 
v_cvt_f32_f16_e32 v19, v19 @@ -32799,8 +32835,8 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v47 @@ -32942,24 +32978,24 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 @@ -33039,10 +33075,10 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: ; 
implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -33057,20 +33093,20 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v21, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v11, s26 ; VI-NEXT: v_mov_b32_e32 v24, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v16, s21 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v21, s24 -; VI-NEXT: v_mov_b32_e32 v22, s25 -; VI-NEXT: v_mov_b32_e32 v17, s26 -; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v22, s19 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v14, s25 +; VI-NEXT: v_mov_b32_e32 v12, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: s_cbranch_scc0 .LBB53_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -33083,18 +33119,18 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; VI-NEXT: 
v_lshrrev_b32_e32 v54, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; VI-NEXT: s_cbranch_execnz .LBB53_3 @@ -33104,12 +33140,12 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 @@ -33121,18 +33157,18 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 
v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; VI-NEXT: .LBB53_3: ; %end @@ -33141,42 +33177,42 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 ; VI-NEXT: v_or_b32_sdwa v31, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; VI-NEXT: v_or_b32_sdwa v24, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v16, v15 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 
-; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 ; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v12, v19, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; VI-NEXT: 
v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33227,20 +33263,20 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: v_mov_b32_e32 v24, s17 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v16, s21 -; GFX9-NEXT: v_mov_b32_e32 v11, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v21, s24 -; GFX9-NEXT: v_mov_b32_e32 v22, s25 -; GFX9-NEXT: v_mov_b32_e32 v17, s26 -; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v22, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v14, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v13, s28 -; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -33253,18 +33289,18 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 
16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 @@ -33274,12 +33310,12 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 @@ -33291,60 +33327,60 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v54, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: .LBB53_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; 
GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 @@ -35296,51 +35332,51 @@ define inreg <12 x double> 
@bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v18, v37, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -35364,64 +35400,64 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; VI-NEXT: v_add_f16_e32 v1, s16, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_mov_b32_e32 v1, s42 -; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s41 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s41 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s18, v13 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_e32 v4, s18, v13 +; VI-NEXT: v_add_f16_e32 v5, s19, v13 ; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, s19, v13 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: v_mov_b32_e32 v4, s15 -; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, s20, v13 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_e32 v3, v5, v3 ; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_e32 v6, s20, v13 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s21, v13 ; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, s21, v13 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_or_b32_e32 v4, v6, v4 ; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, s22, v13 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_or_b32_e32 v5, v7, v5 ; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_e32 v8, s22, v13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s23, v13 ; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, s23, v13 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 ; VI-NEXT: v_mov_b32_e32 v8, s11 -; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, s24, v13 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_or_b32_e32 v7, v9, v7 ; 
VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_e32 v10, s24, v13 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s25, v13 ; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, s25, v13 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_or_b32_e32 v8, v10, v8 ; VI-NEXT: v_mov_b32_e32 v10, s9 -; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, s26, v13 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_or_b32_e32 v9, v11, v9 ; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_e32 v12, s26, v13 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s27, v13 ; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, s27, v13 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_or_b32_e32 v10, v12, v10 ; VI-NEXT: v_mov_b32_e32 v12, s7 -; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, s28, v13 -; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_or_b32_e32 v11, v14, v11 ; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_e32 v15, s28, v13 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v13, s29, v13 -; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v12, v15, v12 +; VI-NEXT: v_add_f16_sdwa v15, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v16, v48, v23 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_or_b32_e32 v15, v17, v16 ; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v16, v17, v16 @@ -35465,7 +35501,6 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v39, v2 ; GFX9-NEXT: v_mov_b32_e32 v48, v1 ; GFX9-NEXT: v_mov_b32_e32 v49, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_lshr_b32 s40, s29, 16 ; GFX9-NEXT: s_lshr_b32 s41, s28, 16 ; GFX9-NEXT: s_lshr_b32 s42, s27, 16 @@ -35480,6 +35515,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -36577,10 +36613,10 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: 
v_lshrrev_b32_e32 v53, 16, v3 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -36593,11 +36629,11 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v54, 3, v54 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: v_add_u16_e32 v53, 3, v53 +; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_add_u16_e32 v52, 3, v52 +; VI-NEXT: v_add_u16_e32 v53, 3, v53 ; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_add_u16_e32 v24, 3, v24 +; VI-NEXT: v_add_u16_e32 v52, 3, v52 ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_add_u16_e32 v51, 3, v51 ; VI-NEXT: v_add_u16_e32 v6, 3, v6 @@ -36639,6 +36675,10 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; VI-NEXT: v_or_b32_sdwa v3, v3, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 ; VI-NEXT: v_or_b32_sdwa v4, v4, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 ; VI-NEXT: v_or_b32_sdwa v5, v5, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -36652,8 +36692,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v9, v9, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 ; VI-NEXT: v_or_b32_sdwa v10, v10, v24 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 ; VI-NEXT: v_or_b32_sdwa v12, v12, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 @@ -36677,14 +36715,12 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; VI-NEXT: v_or_b32_sdwa v22, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 ; VI-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -36693,28 +36729,28 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -36724,101 +36760,101 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; 
GFX9-NEXT: v_perm_b32 v23, v55, v23, s6 ; GFX9-NEXT: v_perm_b32 v22, v54, v22, s6 -; GFX9-NEXT: v_perm_b32 v21, v53, v21, s6 -; GFX9-NEXT: v_perm_b32 v20, v52, v20, s6 -; GFX9-NEXT: v_perm_b32 v19, v51, v19, s6 -; GFX9-NEXT: v_perm_b32 v18, v50, v18, s6 -; GFX9-NEXT: v_perm_b32 v17, v49, v17, s6 -; GFX9-NEXT: v_perm_b32 v16, v48, v16, s6 -; GFX9-NEXT: v_perm_b32 v15, v39, v15, s6 -; GFX9-NEXT: v_perm_b32 v14, v38, v14, s6 -; GFX9-NEXT: v_perm_b32 v13, v37, v13, s6 -; GFX9-NEXT: v_perm_b32 v12, v36, v12, s6 -; GFX9-NEXT: v_perm_b32 v11, v35, v11, s6 -; GFX9-NEXT: v_perm_b32 v10, v34, v10, s6 -; GFX9-NEXT: v_perm_b32 v9, v33, v9, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v8, s6 -; GFX9-NEXT: v_perm_b32 v7, v31, v7, s6 -; GFX9-NEXT: v_perm_b32 v6, v30, v6, s6 -; GFX9-NEXT: v_perm_b32 v5, v28, v5, s6 -; GFX9-NEXT: v_perm_b32 v4, v29, v4, s6 -; GFX9-NEXT: v_perm_b32 v3, v27, v3, s6 -; GFX9-NEXT: v_perm_b32 v2, v26, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v25, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v24, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v52, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v53, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v39, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v49, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v50, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v51, v7, s6 +; GFX9-NEXT: v_perm_b32 v8, v38, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v37, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v35, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v34, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v33, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v32, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v36, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v31, v15, s6 +; GFX9-NEXT: v_perm_b32 v16, v30, v16, s6 +; GFX9-NEXT: v_perm_b32 v17, v29, v17, s6 +; GFX9-NEXT: v_perm_b32 v18, v28, v18, s6 +; GFX9-NEXT: v_perm_b32 v19, v27, v19, s6 +; GFX9-NEXT: v_perm_b32 v20, v26, v20, s6 +; GFX9-NEXT: v_perm_b32 v21, v25, v21, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 
op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v30, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v23 ; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v24, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v25, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v26, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v27, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v29, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v28, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v31, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v32, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v33, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v35, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v36, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v37, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v38, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v39, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v48, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v49, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v50, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v51, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v52, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v53, v21, s4 +; GFX9-NEXT: v_perm_b32 v1, v52, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v53, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v39, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v49, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v50, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v51, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v38, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v37, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v35, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v34, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v33, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v32, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v36, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v31, 
v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v30, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v29, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v28, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v27, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v26, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v25, v21, s4 ; GFX9-NEXT: v_perm_b32 v22, v54, v22, s4 ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -37043,71 +37079,71 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v41 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s18 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s21 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 +; 
SI-NEXT: v_cvt_f32_f16_e32 v56, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v17 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v26 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 ; SI-NEXT: v_cvt_f32_f16_e32 
v47, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v57, v40 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v54 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 @@ -37140,7 +37176,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 @@ -37160,7 +37196,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -37230,47 +37266,47 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: s_add_i32 s18, s18, 3 ; 
SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 
4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v57, v40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v45, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -37279,16 +37315,16 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -37325,7 +37361,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -37392,48 +37428,48 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -37447,15 +37483,15 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -37463,9 +37499,9 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -37492,42 +37528,42 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: .LBB57_4: ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: 
$vgpr47 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -37759,9 +37795,9 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 ; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 -; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 ; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 ; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 @@ -37777,41 +37813,41 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 ; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 
op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 ; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 -; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; 
GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v31 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 @@ -37822,10 +37858,10 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 @@ -37842,8 +37878,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; GFX9-NEXT: .LBB57_4: ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v10, s27 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 ; GFX9-NEXT: v_mov_b32_e32 v29, s25 ; GFX9-NEXT: v_mov_b32_e32 v28, s24 ; GFX9-NEXT: v_mov_b32_e32 v27, s23 @@ -37854,10 +37890,10 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v32, s18 ; GFX9-NEXT: v_mov_b32_e32 v31, s17 ; GFX9-NEXT: v_mov_b32_e32 v30, s16 -; GFX9-NEXT: v_mov_b32_e32 v34, s43 -; GFX9-NEXT: v_mov_b32_e32 v35, s42 -; GFX9-NEXT: v_mov_b32_e32 v36, s41 -; GFX9-NEXT: v_mov_b32_e32 v37, s40 +; 
GFX9-NEXT: v_mov_b32_e32 v37, s43 +; GFX9-NEXT: v_mov_b32_e32 v36, s42 +; GFX9-NEXT: v_mov_b32_e32 v35, s41 +; GFX9-NEXT: v_mov_b32_e32 v34, s40 ; GFX9-NEXT: v_mov_b32_e32 v38, s15 ; GFX9-NEXT: v_mov_b32_e32 v39, s14 ; GFX9-NEXT: v_mov_b32_e32 v48, s13 @@ -37906,16 +37942,16 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 ; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 ; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v37, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v36, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v35, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v13, v34, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v10, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v35, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v37, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v30 ; GFX9-NEXT: v_mov_b32_e32 v1, v31 @@ -38843,10 +38879,10 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -38859,11 +38895,11 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x 
half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_e32 v53, 0x200, v53 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_add_f16_e32 v52, 0x200, v52 +; VI-NEXT: v_add_f16_e32 v53, 0x200, v53 ; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_add_f16_e32 v52, 0x200, v52 ; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 ; VI-NEXT: v_add_f16_e32 v51, 0x200, v51 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -38905,6 +38941,10 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; VI-NEXT: v_or_b32_sdwa v3, v3, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 ; VI-NEXT: v_or_b32_sdwa v4, v4, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 ; VI-NEXT: v_or_b32_sdwa v5, v5, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -38918,8 +38958,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v9, v9, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 ; VI-NEXT: v_or_b32_sdwa v10, v10, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 -; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 ; VI-NEXT: v_or_b32_sdwa v12, v12, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
v_lshlrev_b32_e32 v24, 16, v35 @@ -38943,14 +38981,12 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; VI-NEXT: v_or_b32_sdwa v22, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 ; VI-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -38959,28 +38995,28 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -38990,102 +39026,102 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s6 ; GFX9-NEXT: v_perm_b32 v22, v54, v22, s6 -; GFX9-NEXT: v_perm_b32 v21, v53, v21, s6 -; GFX9-NEXT: v_perm_b32 v20, v52, v20, s6 -; GFX9-NEXT: v_perm_b32 v19, v51, v19, s6 -; GFX9-NEXT: v_perm_b32 v18, v50, v18, s6 -; GFX9-NEXT: v_perm_b32 v17, v49, v17, s6 -; GFX9-NEXT: v_perm_b32 v16, v48, v16, s6 -; GFX9-NEXT: v_perm_b32 v15, 
v39, v15, s6 -; GFX9-NEXT: v_perm_b32 v14, v38, v14, s6 -; GFX9-NEXT: v_perm_b32 v13, v37, v13, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v12, v36, v12, s6 -; GFX9-NEXT: v_perm_b32 v11, v35, v11, s6 -; GFX9-NEXT: v_perm_b32 v10, v34, v10, s6 -; GFX9-NEXT: v_perm_b32 v9, v33, v9, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v8, s6 -; GFX9-NEXT: v_perm_b32 v7, v31, v7, s6 -; GFX9-NEXT: v_perm_b32 v6, v30, v6, s6 -; GFX9-NEXT: v_perm_b32 v5, v28, v5, s6 -; GFX9-NEXT: v_perm_b32 v4, v29, v4, s6 -; GFX9-NEXT: v_perm_b32 v3, v27, v3, s6 -; GFX9-NEXT: v_perm_b32 v2, v26, v2, s6 -; GFX9-NEXT: v_perm_b32 v1, v25, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v24, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v52, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v53, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v39, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v49, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v50, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v51, v7, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v8, v38, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v37, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v35, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v34, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v33, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v32, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v36, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v31, v15, s6 +; GFX9-NEXT: v_perm_b32 v16, v30, v16, s6 +; GFX9-NEXT: v_perm_b32 v17, v29, v17, s6 +; GFX9-NEXT: v_perm_b32 v18, v28, v18, s6 +; GFX9-NEXT: v_perm_b32 v19, v27, v19, s6 +; GFX9-NEXT: v_perm_b32 v20, v26, v20, s6 +; GFX9-NEXT: v_perm_b32 v21, v25, v21, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 
v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, 
s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v23 ; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v24, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v25, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v26, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v27, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v29, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v28, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v30, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v31, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v32, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v33, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v34, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v35, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v36, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v37, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v38, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v39, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v48, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v49, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v50, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v51, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v52, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v53, v21, s4 +; GFX9-NEXT: v_perm_b32 v1, v52, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v53, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v39, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v48, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v49, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v50, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v51, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v38, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v37, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v35, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v34, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v33, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v32, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v36, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v31, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v30, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v29, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v28, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v27, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v26, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v25, 
v21, s4 ; GFX9-NEXT: v_perm_b32 v22, v54, v22, s4 ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -39282,113 +39318,115 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s20 +; 
SI-NEXT: v_cvt_f16_f32_e32 v28, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v17, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s29 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v30 +; SI-NEXT: v_or_b32_e32 v51, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 
v29, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v27 ; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -39396,13 +39434,13 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -39420,48 +39458,48 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v2, v2, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_or_b32_e32 v11, v11, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 @@ -39472,36 +39510,39 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v10, v10, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v14, v14, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: 
v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v35, v35, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_or_b32_e32 v35, v35, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_or_b32_e32 v34, v34, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_or_b32_e32 v34, v34, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 @@ -39509,11 +39550,10 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_or_b32_e32 v38, v38, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; 
SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v38, v38, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 @@ -39524,67 +39564,67 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 -; SI-NEXT: v_or_b32_e32 v23, v23, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v53 -; SI-NEXT: v_or_b32_e32 v22, v22, v50 -; SI-NEXT: v_or_b32_e32 v25, v25, v30 -; SI-NEXT: v_or_b32_e32 v24, v24, v29 -; SI-NEXT: v_or_b32_e32 v21, v21, v41 -; SI-NEXT: v_or_b32_e32 v16, v16, v28 -; SI-NEXT: v_or_b32_e32 v48, v48, v54 -; SI-NEXT: v_or_b32_e32 v39, v39, v42 -; SI-NEXT: v_or_b32_e32 v32, v32, v52 -; SI-NEXT: v_or_b32_e32 v31, v31, v51 +; SI-NEXT: v_or_b32_e32 v49, v49, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_or_b32_e32 v22, v22, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v55 +; SI-NEXT: v_or_b32_e32 v50, v50, v52 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v21, v21, v29 +; SI-NEXT: v_or_b32_e32 v16, v16, v26 +; 
SI-NEXT: v_or_b32_e32 v48, v48, v30 +; SI-NEXT: v_or_b32_e32 v39, v39, v40 +; SI-NEXT: v_or_b32_e32 v32, v32, v54 +; SI-NEXT: v_or_b32_e32 v31, v31, v53 ; SI-NEXT: v_or_b32_e32 v15, v15, v43 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 ; SI-NEXT: v_or_b32_e32 v6, v6, v44 -; SI-NEXT: v_alignbit_b32 v40, v22, v30, 16 -; SI-NEXT: v_alignbit_b32 v30, v23, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v28, v49, v28, 16 -; SI-NEXT: v_alignbit_b32 v55, v38, v54, 16 -; SI-NEXT: v_alignbit_b32 v54, v34, v42, 16 -; SI-NEXT: v_alignbit_b32 v53, v35, v52, 16 -; SI-NEXT: v_alignbit_b32 v52, v14, v51, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v43, 16 -; SI-NEXT: v_alignbit_b32 v50, v11, v27, 16 -; SI-NEXT: v_alignbit_b32 v27, v5, v26, 16 -; SI-NEXT: v_alignbit_b32 v26, v2, v44, 16 +; SI-NEXT: v_alignbit_b32 v42, v50, v28, 16 +; SI-NEXT: v_alignbit_b32 v28, v22, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v18, v29, 16 +; SI-NEXT: v_alignbit_b32 v26, v49, v26, 16 +; SI-NEXT: v_alignbit_b32 v41, v38, v30, 16 +; SI-NEXT: v_alignbit_b32 v40, v34, v40, 16 +; SI-NEXT: v_alignbit_b32 v55, v35, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v52, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v24, 16 +; SI-NEXT: v_alignbit_b32 v24, v2, v44, 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v25, v25, v40 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, 
v29, v30 +; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v29, v20 +; SI-NEXT: v_add_i32_e32 v29, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v28 +; SI-NEXT: v_or_b32_e32 v20, v20, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 @@ -39595,7 +39635,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v26 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen @@ -39607,7 +39647,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v16, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v41 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen @@ -39619,7 +39659,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen @@ -39631,7 +39671,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen @@ -39643,7 +39683,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 ; SI-NEXT: v_or_b32_e32 v12, v12, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen @@ -39655,7 +39695,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 @@ -39666,7 +39706,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen @@ -39678,7 +39718,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen @@ -39690,7 +39730,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 @@ -39699,11 +39739,14 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: @@ -39887,20 +39930,20 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; GFX9-LABEL: bitcast_v48f16_to_v48i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s43, s29, 16 -; GFX9-NEXT: s_lshr_b32 s42, s28, 16 -; GFX9-NEXT: s_lshr_b32 s41, s27, 16 -; GFX9-NEXT: s_lshr_b32 s40, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s15, s24, 16 +; GFX9-NEXT: s_lshr_b32 s14, s23, 16 +; GFX9-NEXT: s_lshr_b32 s13, s22, 16 +; GFX9-NEXT: s_lshr_b32 s12, s21, 16 +; GFX9-NEXT: s_lshr_b32 s11, s20, 16 +; GFX9-NEXT: 
s_lshr_b32 s10, s19, 16 +; GFX9-NEXT: s_lshr_b32 s9, s18, 16 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 @@ -39917,80 +39960,80 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_cbranch_execnz .LBB59_4 ; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v34, 0x200 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s43 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s42 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s40 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s15 +; GFX9-NEXT: v_pk_add_f16 v28, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s14 +; GFX9-NEXT: v_pk_add_f16 v27, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s13 +; GFX9-NEXT: v_pk_add_f16 v26, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s12 +; GFX9-NEXT: v_pk_add_f16 v25, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s11 +; GFX9-NEXT: v_pk_add_f16 v24, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s10 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s9 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s8 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v34 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s7 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v34 op_sel_hi:[1,0] ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s6 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_pk_add_f16 v34, s4, v34 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 
op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 -; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 -; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 -; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 -; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 -; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 -; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 -; GFX9-NEXT: v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 -; GFX9-NEXT: v_pk_add_f16 v27, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 -; GFX9-NEXT: v_pk_add_f16 v26, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 -; GFX9-NEXT: v_pk_add_f16 v25, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 -; GFX9-NEXT: v_pk_add_f16 v24, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 -; GFX9-NEXT: v_pk_add_f16 v33, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 -; GFX9-NEXT: v_pk_add_f16 v32, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 -; GFX9-NEXT: v_pk_add_f16 v31, s4, v14 op_sel_hi:[1,0] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 -; GFX9-NEXT: v_pk_add_f16 v30, s4, v14 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v31 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v52, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 @@ -40005,34 +40048,34 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; GFX9-NEXT: .LBB59_3: ; GFX9-NEXT: s_branch .LBB59_2 ; GFX9-NEXT: .LBB59_4: -; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v34, s29 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s27 +; GFX9-NEXT: v_mov_b32_e32 v11, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 ; GFX9-NEXT: v_mov_b32_e32 v28, s24 ; GFX9-NEXT: v_mov_b32_e32 v27, s23 ; GFX9-NEXT: v_mov_b32_e32 v26, s22 ; GFX9-NEXT: v_mov_b32_e32 v25, s21 ; GFX9-NEXT: v_mov_b32_e32 v24, s20 -; GFX9-NEXT: v_mov_b32_e32 v33, s19 +; GFX9-NEXT: v_mov_b32_e32 v29, s19 ; GFX9-NEXT: v_mov_b32_e32 v32, s18 ; GFX9-NEXT: v_mov_b32_e32 v31, s17 ; GFX9-NEXT: v_mov_b32_e32 v30, s16 -; GFX9-NEXT: 
v_mov_b32_e32 v34, s43 -; GFX9-NEXT: v_mov_b32_e32 v35, s42 -; GFX9-NEXT: v_mov_b32_e32 v36, s41 -; GFX9-NEXT: v_mov_b32_e32 v37, s40 -; GFX9-NEXT: v_mov_b32_e32 v38, s15 -; GFX9-NEXT: v_mov_b32_e32 v39, s14 -; GFX9-NEXT: v_mov_b32_e32 v48, s13 -; GFX9-NEXT: v_mov_b32_e32 v49, s12 -; GFX9-NEXT: v_mov_b32_e32 v50, s11 -; GFX9-NEXT: v_mov_b32_e32 v51, s10 -; GFX9-NEXT: v_mov_b32_e32 v52, s9 -; GFX9-NEXT: v_mov_b32_e32 v53, s8 -; GFX9-NEXT: v_mov_b32_e32 v54, s7 -; GFX9-NEXT: v_mov_b32_e32 v55, s6 +; GFX9-NEXT: v_mov_b32_e32 v38, s6 +; GFX9-NEXT: v_mov_b32_e32 v37, s43 +; GFX9-NEXT: v_mov_b32_e32 v36, s42 +; GFX9-NEXT: v_mov_b32_e32 v35, s41 +; GFX9-NEXT: v_mov_b32_e32 v39, s40 +; GFX9-NEXT: v_mov_b32_e32 v48, s15 +; GFX9-NEXT: v_mov_b32_e32 v49, s14 +; GFX9-NEXT: v_mov_b32_e32 v50, s13 +; GFX9-NEXT: v_mov_b32_e32 v51, s12 +; GFX9-NEXT: v_mov_b32_e32 v52, s11 +; GFX9-NEXT: v_mov_b32_e32 v33, s10 +; GFX9-NEXT: v_mov_b32_e32 v53, s9 +; GFX9-NEXT: v_mov_b32_e32 v54, s8 +; GFX9-NEXT: v_mov_b32_e32 v55, s7 ; GFX9-NEXT: .LBB59_5: ; %end ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 @@ -40047,40 +40090,40 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v29 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: 
v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v29, v39, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v10 ; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 ; GFX9-NEXT: v_lshl_or_b32 v31, v54, 16, v31 ; GFX9-NEXT: v_lshl_or_b32 v32, v53, 16, v32 -; GFX9-NEXT: v_lshl_or_b32 v33, v52, 16, v33 -; GFX9-NEXT: v_lshl_or_b32 v24, v51, 16, v24 -; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 -; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 -; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 -; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 -; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v51, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v50, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v49, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v48, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v10, v35, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v36, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v37, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v36, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v35, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v13, v34, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v37, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v30 ; GFX9-NEXT: v_mov_b32_e32 v1, v31 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 97d040b545c09..e29aa279a88f5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -184,7 +184,8 @@ define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; SI-LABEL: bitcast_v26i32_to_v26f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -198,7 +199,6 @@ define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -211,6 +211,7 @@ define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -250,7 +251,8 @@ define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; VI-LABEL: bitcast_v26i32_to_v26f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -264,7 +266,6 @@ define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -277,6 +278,7 @@ define inreg <26 x float> 
@bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB1_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -316,7 +318,8 @@ define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; GFX9-LABEL: bitcast_v26i32_to_v26f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -330,7 +333,6 @@ define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -343,6 +345,7 @@ define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -617,7 +620,8 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; SI-LABEL: bitcast_v26f32_to_v26i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -631,7 +635,6 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; SI-NEXT: 
v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -644,6 +647,7 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -683,7 +687,8 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; VI-LABEL: bitcast_v26f32_to_v26i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -697,7 +702,6 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -710,6 +714,7 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB3_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -749,7 +754,8 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; GFX9-LABEL: bitcast_v26f32_to_v26i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -763,7 +769,6 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -776,6 +781,7 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -1050,7 +1056,8 @@ define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v26i32_to_v13i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -1064,7 +1071,6 @@ define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -1077,6 +1083,7 @@ define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -1116,7 +1123,8 @@ define inreg 
<13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; VI-LABEL: bitcast_v26i32_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -1130,7 +1138,6 @@ define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -1143,6 +1150,7 @@ define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB5_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -1182,7 +1190,8 @@ define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; GFX9-LABEL: bitcast_v26i32_to_v13i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -1196,7 +1205,6 @@ define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -1209,6 +1217,7 @@ define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i3 ; 
GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -1503,7 +1512,8 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v13i64_to_v26i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -1517,7 +1527,6 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -1530,6 +1539,7 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -1569,7 +1579,8 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; VI-LABEL: bitcast_v13i64_to_v26i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -1583,7 +1594,6 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 
v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -1596,6 +1606,7 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -1635,7 +1646,8 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; GFX9-LABEL: bitcast_v13i64_to_v26i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -1649,7 +1661,6 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -1662,6 +1673,7 @@ define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -1956,7 +1968,8 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, ; SI-LABEL: bitcast_v26i32_to_v13f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: 
v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -1970,7 +1983,6 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -1983,6 +1995,7 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -2022,7 +2035,8 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, ; VI-LABEL: bitcast_v26i32_to_v13f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -2036,7 +2050,6 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -2049,6 +2062,7 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB9_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -2088,7 +2102,8 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, 
; GFX9-LABEL: bitcast_v26i32_to_v13f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -2102,7 +2117,6 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -2115,6 +2129,7 @@ define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -2350,33 +2365,34 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; SI-LABEL: bitcast_v13f64_to_v26i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v11, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, 
s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -2403,33 +2419,34 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; VI-LABEL: bitcast_v13f64_to_v26i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v11, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; 
VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -2456,33 +2473,34 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; GFX9-LABEL: bitcast_v13f64_to_v26i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 
v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -5870,18 +5888,20 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 @@ -5921,14 +5941,11 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: 
v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v25, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -5943,20 +5960,21 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v11, s61 ; VI-NEXT: v_mov_b32_e32 v12, s62 ; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 16 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -6130,6 +6148,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -6145,7 +6164,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 @@ -7675,7 +7693,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: 
s_lshr_b32 s4, s15, 16 @@ -7720,7 +7738,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 @@ -7800,7 +7818,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 @@ -7830,7 +7848,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v7, s77 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s76 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s74 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s63 @@ -7975,24 +7993,24 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: 
buffer_store_dword v13, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 @@ -8077,10 +8095,10 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -10567,18 +10585,20 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 @@ -10618,14 +10638,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v25, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -10640,17 +10657,18 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v11, s61 ; VI-NEXT: v_mov_b32_e32 v12, s62 ; VI-NEXT: 
v_mov_b32_e32 v13, s63 +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_cbranch_execnz .LBB19_3 ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -10702,16 +10720,16 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v16, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_add_f16_e32 v18, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v17, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_or_b32_e32 v17, v19, v18 ; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -10788,6 +10806,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -10803,7 +10822,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 @@ -11345,7 +11363,8 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; SI-LABEL: bitcast_v26f32_to_v13i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -11359,7 +11378,6 @@ 
define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -11372,6 +11390,7 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -11411,7 +11430,8 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; VI-LABEL: bitcast_v26f32_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -11425,7 +11445,6 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -11438,6 +11457,7 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB21_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -11477,7 +11497,8 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; GFX9-LABEL: bitcast_v26f32_to_v13i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -11491,7 +11512,6 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -11504,6 +11524,7 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -11785,7 +11806,8 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; SI-LABEL: bitcast_v13i64_to_v26f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -11799,7 +11821,6 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -11812,6 +11833,7 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: 
v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -11851,7 +11873,8 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; VI-LABEL: bitcast_v13i64_to_v26f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -11865,7 +11888,6 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -11878,6 +11900,7 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB23_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -11917,7 +11940,8 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; GFX9-LABEL: bitcast_v13i64_to_v26f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -11931,7 +11955,6 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: 
v_mov_b32_e32 v3, s19 @@ -11944,6 +11967,7 @@ define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -12225,7 +12249,8 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; SI-LABEL: bitcast_v26f32_to_v13f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -12239,7 +12264,6 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -12252,6 +12276,7 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -12291,7 +12316,8 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; VI-LABEL: bitcast_v26f32_to_v13f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -12305,7 +12331,6 @@ define inreg <13 x double> 
@bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -12318,6 +12343,7 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB25_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -12357,7 +12383,8 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; GFX9-LABEL: bitcast_v26f32_to_v13f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -12371,7 +12398,6 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -12384,6 +12410,7 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -12606,33 +12633,34 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; SI-LABEL: bitcast_v13f64_to_v26f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v11, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -12659,33 +12687,34 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; VI-LABEL: bitcast_v13f64_to_v26f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v11, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: 
v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -12712,33 +12741,34 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; GFX9-LABEL: bitcast_v13f64_to_v26f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; 
GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -13643,13 +13673,13 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v27, s17 ; SI-NEXT: v_mov_b32_e32 v25, s18 ; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v21, s20 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s20 +; SI-NEXT: v_mov_b32_e32 v21, s21 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 ; SI-NEXT: v_mov_b32_e32 v16, s26 ; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: v_mov_b32_e32 v14, s28 @@ -13661,7 +13691,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; 
%bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 ; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 @@ -13669,9 +13699,9 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v35, v13, v14, 16 ; SI-NEXT: v_alignbit_b32 v37, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v48, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v50, v20, v22, 16 -; SI-NEXT: v_alignbit_b32 v52, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v18, v20, 16 +; SI-NEXT: v_alignbit_b32 v50, v17, v19, 16 +; SI-NEXT: v_alignbit_b32 v52, v21, v23, 16 ; SI-NEXT: v_alignbit_b32 v54, v24, v25, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v40, v27, v28, 16 @@ -13683,11 +13713,11 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) @@ -13698,12 +13728,12 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: 
v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 @@ -13720,7 +13750,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 ; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 @@ -13728,9 +13758,9 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v35, v13, v14, 16 ; SI-NEXT: v_alignbit_b32 v37, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v48, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v50, v20, v22, 16 -; SI-NEXT: v_alignbit_b32 v52, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v18, v20, 16 +; SI-NEXT: v_alignbit_b32 v50, v17, v19, 16 +; SI-NEXT: v_alignbit_b32 v52, v21, v23, 16 ; SI-NEXT: v_alignbit_b32 v54, v24, v25, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v40, v27, v28, 16 @@ -13742,11 +13772,11 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) @@ -13775,38 +13805,38 @@ define inreg <52 x i16> 
@bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v42 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, 
v48 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 @@ -13897,7 +13927,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13939,7 +13969,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_branch .LBB29_2 ; @@ -13947,16 +13977,16 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v22, s16 -; VI-NEXT: v_mov_b32_e32 v20, s17 -; VI-NEXT: v_mov_b32_e32 v18, s18 -; VI-NEXT: v_mov_b32_e32 v17, s19 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v14, s21 -; VI-NEXT: v_mov_b32_e32 v13, s22 -; VI-NEXT: v_mov_b32_e32 v24, s23 +; VI-NEXT: v_mov_b32_e32 v25, s16 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v22, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 
v15, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 +; VI-NEXT: v_mov_b32_e32 v13, s24 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v25, s24 ; VI-NEXT: v_mov_b32_e32 v23, s25 ; VI-NEXT: v_mov_b32_e32 v21, s26 ; VI-NEXT: v_mov_b32_e32 v19, s27 @@ -13985,15 +14015,15 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; VI-NEXT: s_cbranch_execnz .LBB29_3 ; VI-NEXT: .LBB29_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 @@ -14013,8 +14043,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -14022,6 +14050,8 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; VI-NEXT: v_add_f32_e32 v22, 
1.0, v22 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 @@ -14039,34 +14069,34 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; VI-NEXT: .LBB29_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v25, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 ; VI-NEXT: v_or_b32_sdwa v33, v23, v13 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 @@ -14152,16 +14182,16 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mov_b32_e32 v22, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v18, s18 -; GFX9-NEXT: v_mov_b32_e32 v17, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v14, s21 -; GFX9-NEXT: v_mov_b32_e32 v13, s22 -; GFX9-NEXT: v_mov_b32_e32 v24, s23 +; GFX9-NEXT: v_mov_b32_e32 v25, s16 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v22, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v25, s24 ; GFX9-NEXT: v_mov_b32_e32 v23, s25 ; GFX9-NEXT: v_mov_b32_e32 v21, s26 ; GFX9-NEXT: v_mov_b32_e32 v19, s27 @@ -14190,15 +14220,15 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; GFX9-NEXT: s_cbranch_execnz .LBB29_3 ; GFX9-NEXT: .LBB29_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 @@ -14218,8 +14248,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -14227,6 +14255,8 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 @@ -14244,21 +14274,17 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; GFX9-NEXT: .LBB29_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 @@ -14268,13 +14294,13 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v14, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v12, v43, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v42, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v15, v40, 16, v0 @@ -14287,26 +14313,30 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX9-NEXT: 
v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v22 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v24 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v0 @@ -16095,18 +16125,20 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, 
s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 @@ -16146,14 +16178,11 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v25, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -16168,20 +16197,21 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s61 ; VI-NEXT: v_mov_b32_e32 v12, s62 ; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: 
.LBB31_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 16 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -16355,6 +16385,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -16370,7 +16401,6 @@ 
define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 @@ -17881,12 +17911,12 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 @@ -17928,7 +17958,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 @@ -17943,7 +17973,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 @@ -17958,7 +17988,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 ; 
SI-NEXT: v_add_f32_e64 v9, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 ; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 ; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 @@ -17971,7 +18001,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 ; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 ; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s11, 1.0 ; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 ; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 ; SI-NEXT: v_add_f32_e64 v8, s7, 1.0 @@ -17984,13 +18014,13 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 @@ -18001,7 +18031,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v12 ; SI-NEXT: s_waitcnt expcnt(1) @@ -18015,7 +18045,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 ; SI-NEXT: v_cvt_f32_f16_e32 
v36, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 @@ -18028,9 +18058,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v3 @@ -18041,14 +18071,14 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -18098,7 +18128,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v50 ; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v30, v4 @@ -18188,12 +18218,12 @@ define inreg <52 x half> 
@bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v14, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 @@ -18202,8 +18232,8 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 @@ -18272,7 +18302,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr39 @@ -18299,11 +18329,11 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; 
implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -18320,16 +18350,16 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v22, s16 -; VI-NEXT: v_mov_b32_e32 v20, s17 -; VI-NEXT: v_mov_b32_e32 v18, s18 -; VI-NEXT: v_mov_b32_e32 v17, s19 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v14, s21 -; VI-NEXT: v_mov_b32_e32 v13, s22 -; VI-NEXT: v_mov_b32_e32 v24, s23 +; VI-NEXT: v_mov_b32_e32 v25, s16 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v22, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 +; VI-NEXT: v_mov_b32_e32 v13, s24 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v25, s24 ; VI-NEXT: v_mov_b32_e32 v23, s25 ; VI-NEXT: v_mov_b32_e32 v21, s26 ; VI-NEXT: v_mov_b32_e32 v19, s27 @@ -18358,15 +18388,15 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: 
v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; VI-NEXT: s_cbranch_execnz .LBB33_3 ; VI-NEXT: .LBB33_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 @@ -18386,8 +18416,6 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -18395,6 +18423,8 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 @@ -18412,34 +18442,34 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; 
VI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; VI-NEXT: .LBB33_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v25, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v20, v22 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 ; VI-NEXT: v_or_b32_sdwa v33, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 @@ -18525,16 +18555,16 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mov_b32_e32 v22, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v18, s18 -; GFX9-NEXT: v_mov_b32_e32 v17, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v14, s21 -; GFX9-NEXT: v_mov_b32_e32 v13, s22 -; GFX9-NEXT: v_mov_b32_e32 v24, s23 +; GFX9-NEXT: v_mov_b32_e32 v25, s16 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v22, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v25, s24 ; GFX9-NEXT: v_mov_b32_e32 v23, s25 ; GFX9-NEXT: v_mov_b32_e32 
v21, s26 ; GFX9-NEXT: v_mov_b32_e32 v19, s27 @@ -18563,15 +18593,15 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; GFX9-NEXT: s_cbranch_execnz .LBB33_3 ; GFX9-NEXT: .LBB33_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 @@ -18591,8 +18621,6 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -18600,6 +18628,8 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 
; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 @@ -18617,21 +18647,17 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; GFX9-NEXT: .LBB33_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 @@ -18641,13 +18667,13 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 ; 
GFX9-NEXT: v_lshl_or_b32 v14, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v12, v43, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v42, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v15, v40, 16, v0 @@ -18660,26 +18686,30 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v22 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v24 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v0 @@ -20788,18 +20818,20 @@ define inreg <26 x float> 
@bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 @@ -20839,14 +20871,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v25, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -20861,17 +20890,18 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s61 ; VI-NEXT: v_mov_b32_e32 v12, s62 ; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_cbranch_execnz .LBB35_3 ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -20923,16 +20953,16 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa 
v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v16, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v17, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_or_b32_e32 v17, v19, v18 ; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -21009,6 +21039,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -21024,7 +21055,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg 
%a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 @@ -21586,7 +21616,8 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; SI-LABEL: bitcast_v13i64_to_v13f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v23, v9 @@ -21600,7 +21631,6 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -21613,6 +21643,7 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB37_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -21652,7 +21683,8 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; VI-LABEL: bitcast_v13i64_to_v13f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 @@ -21666,7 +21698,6 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; VI-NEXT: 
v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -21679,6 +21710,7 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB37_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -21718,7 +21750,8 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; GFX9-LABEL: bitcast_v13i64_to_v13f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 ; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 @@ -21732,7 +21765,6 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -21745,6 +21777,7 @@ define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -21987,33 +22020,34 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; SI-LABEL: bitcast_v13f64_to_v13i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v12 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v11, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -22040,33 +22074,34 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; VI-LABEL: bitcast_v13f64_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v11, v12 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: 
v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -22093,33 +22128,34 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; GFX9-LABEL: bitcast_v13f64_to_v13i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; 
GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -25521,18 +25557,20 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v14, 
v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 @@ -25572,14 +25610,11 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v25, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -25594,20 +25629,21 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v11, s61 ; VI-NEXT: v_mov_b32_e32 v12, s62 ; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 16 ; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -25781,6 +25817,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -25796,7 +25833,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x 
i16> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 @@ -27341,7 +27377,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 @@ -27386,7 +27422,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 @@ -27466,7 +27502,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 @@ -27496,7 +27532,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v7, s77 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s76 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s74 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s63 @@ -27641,24 +27677,24 @@ define inreg <52 x half> 
@bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 @@ -27743,10 +27779,10 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -30233,18 +30269,20 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; VI-NEXT: s_or_b32 s5, 
s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 @@ -30284,14 +30322,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: 
v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v25, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -30306,17 +30341,18 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v11, s61 ; VI-NEXT: v_mov_b32_e32 v12, s62 ; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_cbranch_execnz .LBB47_3 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -30368,16 +30404,16 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v16, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v17, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_or_b32_e32 v17, v19, v18 ; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -30454,6 +30490,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -30469,7 +30506,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: 
s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 @@ -31625,15 +31661,15 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v25, s16 -; SI-NEXT: v_mov_b32_e32 v26, s17 ; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v21, s20 +; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v26, s17 ; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v21, s22 -; SI-NEXT: v_mov_b32_e32 v22, s23 -; SI-NEXT: v_mov_b32_e32 v17, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v22, s21 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_mov_b32_e32 v15, s26 ; SI-NEXT: v_mov_b32_e32 v16, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -31654,9 +31690,9 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v32, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v39, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v22, v21, 16 ; SI-NEXT: v_alignbit_b32 v54, v24, v23, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 @@ -31668,11 +31704,11 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v53, 
16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) @@ -31681,9 +31717,9 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 @@ -31700,9 +31736,9 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v32, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v39, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v22, v21, 16 ; SI-NEXT: v_alignbit_b32 v54, v24, v23, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 @@ -31714,11 +31750,11 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; 
SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) @@ -31747,38 +31783,38 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 -; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v42 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) 
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 @@ -31919,53 +31955,53 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v21, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v18, s19 -; VI-NEXT: v_mov_b32_e32 v13, s20 -; VI-NEXT: v_mov_b32_e32 v14, s21 -; VI-NEXT: v_mov_b32_e32 v30, s22 -; VI-NEXT: v_mov_b32_e32 v31, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v19, s26 -; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: v_mov_b32_e32 v25, s16 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v26, s17 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: 
v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v14, s25 +; VI-NEXT: v_mov_b32_e32 v23, s26 +; VI-NEXT: v_mov_b32_e32 v24, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v15, s28 -; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v22, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB49_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: 
v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -31974,95 +32010,95 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: 
v_lshrrev_b32_e32 v52, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 
; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; VI-NEXT: v_or_b32_sdwa v30, v30, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 -; VI-NEXT: v_or_b32_sdwa v31, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v25, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; VI-NEXT: v_or_b32_sdwa v26, v19, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: 
v_or_b32_sdwa v28, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; VI-NEXT: v_or_b32_sdwa v35, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 
; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v12, v21, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: 
v_or_b32_sdwa v13, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 ; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 ; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, v36 ; VI-NEXT: v_mov_b32_e32 v1, v37 @@ -32081,17 +32117,18 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: .LBB49_4: ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 @@ -32104,60 +32141,59 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: s_branch .LBB49_2 ; ; GFX9-LABEL: bitcast_v13f64_to_v52i16_scalar: ; GFX9: 
; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mov_b32_e32 v21, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v18, s19 -; GFX9-NEXT: v_mov_b32_e32 v13, s20 -; GFX9-NEXT: v_mov_b32_e32 v14, s21 -; GFX9-NEXT: v_mov_b32_e32 v30, s22 -; GFX9-NEXT: v_mov_b32_e32 v31, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v19, s26 -; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: v_mov_b32_e32 v25, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v26, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v14, s25 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v15, s28 -; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 +; GFX9-NEXT: v_mov_b32_e32 v22, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v54, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: 
v_add_f64 v[10:11], v[10:11], 1.0 @@ -32166,96 +32202,96 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; GFX9-NEXT: .LBB49_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v30, v43, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v31 -; GFX9-NEXT: v_lshl_or_b32 v31, v42, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, 
v15 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v41, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v14, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v14, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v13, v42, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v15, v40, 16, v0 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 -; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; 
GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_lshl_or_b32 v19, v51, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v21 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v20, v50, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v21 -; GFX9-NEXT: v_lshl_or_b32 v21, v49, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v23, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 -; GFX9-NEXT: v_lshl_or_b32 v24, v38, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v36 ; GFX9-NEXT: v_mov_b32_e32 v1, v37 ; GFX9-NEXT: v_mov_b32_e32 v2, v26 @@ -32273,17 +32309,18 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: .LBB49_4: ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; 
implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 @@ -32296,7 +32333,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: s_branch .LBB49_2 ; ; GFX11-LABEL: bitcast_v13f64_to_v52i16_scalar: @@ -34041,18 +34077,20 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 @@ -34092,14 +34130,11 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v25, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -34114,20 +34149,21 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s61 ; VI-NEXT: v_mov_b32_e32 v12, s62 ; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 16 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 
3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -34301,6 +34337,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -34316,7 +34353,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 
16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 @@ -35779,7 +35815,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: s_lshr_b32 s40, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 ; SI-NEXT: s_lshr_b32 s40, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s40 ; SI-NEXT: s_lshr_b32 s40, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s40 ; SI-NEXT: s_lshr_b32 s40, s12, 16 @@ -35869,7 +35905,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_add_f64 v[26:27], s[28:29], 1.0 ; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 ; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[10:11], 1.0 ; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 @@ -35885,22 +35921,22 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: 
v_cvt_f32_f16_e32 v17, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 @@ -35916,14 +35952,14 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 @@ -35945,131 +35981,131 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v44, v10 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 ; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v13, v41, v13 -; SI-NEXT: buffer_store_dword v13, v10, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v14, v41, v14 +; SI-NEXT: buffer_store_dword v14, v10, s[0:3], 0 offen ; SI-NEXT: v_cvt_f16_f32_e32 v10, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v55 ; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 
v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v55, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 ; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v53, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 ; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v51, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v50 ; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 ; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 ; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 ; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 ; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v33, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v32 ; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 ; SI-NEXT: v_add_i32_e32 v28, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v29 ; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v26, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v27 ; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0 ; 
SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 ; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v23 ; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 ; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 @@ -36180,7 +36216,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: ; 
implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr45 @@ -36201,53 +36237,53 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v21, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v18, s19 -; VI-NEXT: v_mov_b32_e32 v13, s20 -; VI-NEXT: v_mov_b32_e32 v14, s21 -; VI-NEXT: v_mov_b32_e32 v30, s22 -; VI-NEXT: v_mov_b32_e32 v31, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v19, s26 -; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: v_mov_b32_e32 v25, s16 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v26, s17 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v14, s25 +; VI-NEXT: v_mov_b32_e32 v23, s26 +; VI-NEXT: v_mov_b32_e32 v24, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v15, s28 -; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v22, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB53_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; VI-NEXT: 
v_lshrrev_b32_e32 v39, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; 
VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -36256,95 +36292,95 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; 
VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_lshlrev_b32_e32 v13, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; VI-NEXT: v_or_b32_sdwa v30, v30, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 -; VI-NEXT: v_or_b32_sdwa v31, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v25, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; VI-NEXT: v_or_b32_sdwa v26, v19, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v24, v13 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; VI-NEXT: v_or_b32_sdwa v35, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v12, v21, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v13, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 ; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 ; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, v36 
; VI-NEXT: v_mov_b32_e32 v1, v37 @@ -36363,17 +36399,18 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: .LBB53_4: ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 @@ -36386,60 +36423,59 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: s_branch .LBB53_2 ; ; GFX9-LABEL: bitcast_v13f64_to_v52f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mov_b32_e32 v21, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v18, s19 -; GFX9-NEXT: v_mov_b32_e32 v13, s20 -; GFX9-NEXT: v_mov_b32_e32 v14, s21 -; GFX9-NEXT: v_mov_b32_e32 v30, s22 -; GFX9-NEXT: v_mov_b32_e32 v31, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v19, s26 -; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: v_mov_b32_e32 v25, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v26, s17 +; GFX9-NEXT: v_mov_b32_e32 
v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v14, s25 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v15, s28 -; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 +; GFX9-NEXT: v_mov_b32_e32 v22, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -36448,96 +36484,96 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 
16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 ; GFX9-NEXT: .LBB53_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v30, v43, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v31 -; GFX9-NEXT: v_lshl_or_b32 v31, v42, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v41, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v14, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v14, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v25 +; GFX9-NEXT: 
v_and_b32_e32 v25, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v13, v42, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v15, v40, 16, v0 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 -; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_lshl_or_b32 v19, v51, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v21 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v20, v50, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v21 -; GFX9-NEXT: v_lshl_or_b32 v21, v49, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 
v0, 0xffff, v8 -; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v23, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 -; GFX9-NEXT: v_lshl_or_b32 v24, v38, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v36 ; GFX9-NEXT: v_mov_b32_e32 v1, v37 ; GFX9-NEXT: v_mov_b32_e32 v2, v26 @@ -36555,17 +36591,18 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: .LBB53_4: ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 @@ -36578,7 +36615,6 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: s_branch .LBB53_2 ; ; GFX11-LABEL: bitcast_v13f64_to_v52f16_scalar: @@ -38643,18 +38679,20 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; VI-NEXT: s_or_b32 s5, 
s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 @@ -38694,14 +38732,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: 
v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v25, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -38716,17 +38751,18 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; VI-NEXT: v_mov_b32_e32 v11, s61 ; VI-NEXT: v_mov_b32_e32 v12, s62 ; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_cbranch_execnz .LBB55_3 ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -38778,16 +38814,16 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v16, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v17, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_or_b32_e32 v17, v19, v18 ; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -38864,6 +38900,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -38879,7 +38916,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: 
s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 @@ -40590,9 +40626,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 @@ -40630,64 +40666,75 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v57 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v45 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 @@ 
-40720,28 +40767,16 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 ; SI-NEXT: s_branch .LBB57_3 ; SI-NEXT: .LBB57_2: ; SI-NEXT: ; implicit-def: $vgpr31 @@ -40749,15 +40784,11 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: 
$vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -40765,16 +40796,20 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -40826,12 +40861,13 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: .LBB57_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: v_mov_b32_e32 v62, v32 -; SI-NEXT: v_mov_b32_e32 v32, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v31, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v51 ; SI-NEXT: v_mov_b32_e32 v51, v53 ; SI-NEXT: 
v_mov_b32_e32 v53, v55 ; SI-NEXT: v_mov_b32_e32 v55, v41 @@ -40839,118 +40875,130 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: s_cbranch_vccnz .LBB57_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v43 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 ; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 ; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 ; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s27 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v42 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v63, v42 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 ; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -41034,36 +41082,28 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -41071,16 +41111,18 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -41089,20 +41131,22 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -41195,29 +41239,25 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -41225,68 +41265,64 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: 
v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -41337,10 +41373,10 @@ define inreg <52 x half> 
@bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; VI-NEXT: s_cbranch_scc0 .LBB57_4 @@ -41380,11 +41416,11 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 @@ -41421,7 +41457,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; VI-NEXT: s_and_b32 s18, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 ; VI-NEXT: s_or_b32 s14, s18, s14 ; VI-NEXT: s_and_b32 s18, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s13, s13, 16 @@ -41431,7 +41467,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; VI-NEXT: s_and_b32 s18, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v12 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 ; VI-NEXT: s_or_b32 s12, s18, s12 ; VI-NEXT: s_and_b32 s18, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s11, s11, 16 @@ -41510,8 +41546,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 @@ -43710,8 +43746,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 @@ -43906,8 +43942,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index a43ce77b20631..98e7ddea762d0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -213,8 +213,8 @@ define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, 
; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -284,8 +284,8 @@ define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -355,8 +355,8 @@ define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -671,8 +671,8 @@ define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -742,8 +742,8 @@ define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -813,8 +813,8 @@ define 
inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -1129,8 +1129,8 @@ define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -1200,8 +1200,8 @@ define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -1271,8 +1271,8 @@ define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -1608,8 +1608,8 @@ define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: 
v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -1679,8 +1679,8 @@ define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -1750,8 +1750,8 @@ define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -2087,8 +2087,8 @@ define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -2158,8 +2158,8 @@ define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -2229,8 +2229,8 @@ define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 
s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -2498,12 +2498,12 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v29, v1 ; SI-NEXT: v_mov_b32_e32 v28, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -2556,12 +2556,12 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v29, v1 ; VI-NEXT: v_mov_b32_e32 v28, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -2614,12 +2614,12 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v29, v1 ; GFX9-NEXT: v_mov_b32_e32 v28, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: 
v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -6305,18 +6305,20 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 @@ -6366,12 +6368,10 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -6394,22 +6394,22 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -6587,6 +6587,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -6602,7 +6603,6 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 @@ -8302,7 +8302,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: 
v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 @@ -8350,7 +8350,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 @@ -8436,7 +8436,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 @@ -8469,7 +8469,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s74 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s73 @@ -8614,24 +8614,24 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: 
v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v20, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v18 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 @@ -8734,10 +8734,10 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 @@ -11450,18 +11450,20 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 @@ -11511,12 +11513,10 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -11537,13 +11537,13 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: 
v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -11601,16 +11601,16 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_or_b32_e32 v19, v21, v20 ; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -11691,6 +11691,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -11706,7 +11707,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 @@ -12316,8 +12316,8 @@ define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -12387,8 +12387,8 @@ define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; 
VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -12458,8 +12458,8 @@ define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -12781,8 +12781,8 @@ define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -12852,8 +12852,8 @@ define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -12923,8 +12923,8 @@ define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -13246,8 +13246,8 @@ define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v3, 
s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -13317,8 +13317,8 @@ define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -13388,8 +13388,8 @@ define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -13643,12 +13643,12 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v29, v1 ; SI-NEXT: v_mov_b32_e32 v28, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -13701,12 +13701,12 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v29, v1 ; VI-NEXT: v_mov_b32_e32 v28, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: 
v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -13759,12 +13759,12 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v29, v1 ; GFX9-NEXT: v_mov_b32_e32 v28, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -14757,13 +14757,13 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 ; SI-NEXT: v_mov_b32_e32 v30, s16 ; SI-NEXT: v_mov_b32_e32 v29, s17 -; SI-NEXT: v_mov_b32_e32 v25, s18 -; SI-NEXT: v_mov_b32_e32 v23, s19 -; SI-NEXT: v_mov_b32_e32 v28, s20 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v26, s19 +; SI-NEXT: v_mov_b32_e32 v24, s20 +; SI-NEXT: v_mov_b32_e32 v21, s21 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v26, s21 -; SI-NEXT: v_mov_b32_e32 v24, s22 -; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 ; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_mov_b32_e32 v19, s25 ; SI-NEXT: v_mov_b32_e32 v18, s26 @@ -14781,7 +14781,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; 
%bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 ; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 @@ -14791,10 +14791,10 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v39, v15, v16, 16 ; SI-NEXT: v_alignbit_b32 v49, v17, v18, 16 ; SI-NEXT: v_alignbit_b32 v51, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v54, v22, v24, 16 -; SI-NEXT: v_alignbit_b32 v40, v26, v28, 16 +; SI-NEXT: v_alignbit_b32 v54, v23, v25, 16 +; SI-NEXT: v_alignbit_b32 v40, v21, v24, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v23, v25, 16 +; SI-NEXT: v_alignbit_b32 v42, v26, v28, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v44, v29, v30, 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 @@ -14808,23 +14808,23 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v23 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v21 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 
v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 @@ -14845,7 +14845,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 16 ; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 ; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 @@ -14855,10 +14855,10 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v39, v15, v16, 16 ; SI-NEXT: v_alignbit_b32 v49, v17, v18, 16 ; SI-NEXT: v_alignbit_b32 v51, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v54, v22, v24, 16 -; SI-NEXT: v_alignbit_b32 v40, v26, v28, 16 +; SI-NEXT: v_alignbit_b32 v54, v23, v25, 16 +; SI-NEXT: v_alignbit_b32 v40, v21, v24, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v23, v25, 16 +; SI-NEXT: v_alignbit_b32 v42, v26, v28, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v44, v29, v30, 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 @@ -14872,11 +14872,11 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v23 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v21 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 ; SI-NEXT: .LBB29_3: ; %end @@ -14891,48 +14891,48 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 
v29, v29, v30 ; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v42 -; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v25, v29, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v47 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v21, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_or_b32_e32 v21, v21, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 @@ -15037,7 +15037,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15085,7 +15085,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; 
implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_branch .LBB29_2 ; @@ -15093,18 +15093,18 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v20, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v26, s21 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v15, s22 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v27, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v24, s24 -; VI-NEXT: v_mov_b32_e32 v23, s25 -; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v27, s23 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v25, s25 +; VI-NEXT: v_mov_b32_e32 v23, s26 ; VI-NEXT: v_mov_b32_e32 v21, s27 ; VI-NEXT: v_mov_b32_e32 v19, s28 ; VI-NEXT: v_mov_b32_e32 v14, s29 @@ -15135,17 +15135,17 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, 
v25 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; VI-NEXT: s_cbranch_execnz .LBB29_3 ; VI-NEXT: .LBB29_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 @@ -15165,17 +15165,17 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 @@ -15193,40 +15193,40 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; VI-NEXT: .LBB29_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v20, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v24, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, 
v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 ; VI-NEXT: v_or_b32_sdwa v39, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 @@ -15320,18 +15320,18 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_mov_b32_e32 v20, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v26, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v27, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v24, s24 -; GFX9-NEXT: v_mov_b32_e32 v23, s25 -; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s25 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 ; GFX9-NEXT: v_mov_b32_e32 v21, s27 ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v14, s29 @@ -15362,17 +15362,17 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v34, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; GFX9-NEXT: s_cbranch_execnz .LBB29_3 ; GFX9-NEXT: .LBB29_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 @@ -15392,17 +15392,17 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 @@ -15420,31 +15420,27 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; GFX9-NEXT: .LBB29_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 @@ -15459,20 +15455,20 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 
v30, v30, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 @@ -15484,11 +15480,15 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v22 ; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v24 ; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 @@ -17420,18 +17420,20 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 @@ -17481,12 +17483,10 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_mov_b32_e32 v0, s4 @@ -17509,22 +17509,22 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: 
v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -17702,6 +17702,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -17717,7 +17718,6 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 @@ -19396,7 +19396,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 @@ -19444,7 +19444,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: 
v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 @@ -19496,7 +19496,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s25, 1.0 ; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 ; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 ; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 @@ -19506,7 +19506,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 ; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 ; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s13, 1.0 ; SI-NEXT: v_add_f32_e64 v16, s12, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 @@ -19519,7 +19519,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 @@ -19529,7 +19529,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 @@ -19540,7 +19540,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> 
inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 @@ -19550,7 +19550,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 @@ -19564,7 +19564,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 @@ -19713,7 +19713,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -19728,7 +19728,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 @@ -19838,10 
+19838,10 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -19863,18 +19863,18 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v20, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v26, s21 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v15, s22 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v27, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v24, s24 -; VI-NEXT: v_mov_b32_e32 v23, s25 -; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v27, s23 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v25, s25 +; VI-NEXT: v_mov_b32_e32 v23, s26 ; VI-NEXT: v_mov_b32_e32 v21, s27 ; VI-NEXT: v_mov_b32_e32 v19, s28 ; VI-NEXT: v_mov_b32_e32 v14, s29 @@ -19905,17 +19905,17 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, 
v24 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; VI-NEXT: s_cbranch_execnz .LBB33_3 ; VI-NEXT: .LBB33_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 @@ -19935,17 +19935,17 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 @@ -19963,40 +19963,40 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; VI-NEXT: 
v_lshrrev_b32_e32 v48, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; VI-NEXT: .LBB33_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v20, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; 
VI-NEXT: v_or_b32_sdwa v34, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v24, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 ; VI-NEXT: v_or_b32_sdwa v39, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 @@ -20090,18 +20090,18 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_mov_b32_e32 v20, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v26, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v27, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v24, s24 -; GFX9-NEXT: v_mov_b32_e32 v23, s25 -; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s25 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 ; GFX9-NEXT: v_mov_b32_e32 v21, s27 ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v14, s29 @@ -20132,17 +20132,17 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; GFX9-NEXT: s_cbranch_execnz .LBB33_3 ; GFX9-NEXT: .LBB33_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 @@ -20162,17 +20162,17 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 @@ -20190,31 +20190,27 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, 
v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; GFX9-NEXT: .LBB33_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 @@ -20229,20 +20225,20 @@ define 
inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 @@ -20254,11 +20250,15 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v22 ; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v24 ; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; GFX9-NEXT: 
v_lshl_or_b32 v25, v52, 16, v0 @@ -22550,18 +22550,20 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 @@ -22611,12 +22613,10 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: 
v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -22637,13 +22637,13 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -22701,16 +22701,16 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_or_b32_e32 v19, v21, v20 ; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -22791,6 +22791,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -22806,7 +22807,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 @@ -23437,8 +23437,8 @@ define 
inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -23508,8 +23508,8 @@ define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -23579,8 +23579,8 @@ define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -23855,12 +23855,12 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v29, v1 ; SI-NEXT: v_mov_b32_e32 v28, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 @@ -23913,12 +23913,12 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v29, v1 ; VI-NEXT: 
v_mov_b32_e32 v28, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 @@ -23971,12 +23971,12 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v29, v1 ; GFX9-NEXT: v_mov_b32_e32 v28, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 @@ -27676,18 +27676,20 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 @@ -27737,12 +27739,10 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -27765,22 +27765,22 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa 
v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -27958,6 +27958,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -27973,7 +27974,6 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 @@ -29687,7 +29687,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 @@ -29735,7 +29735,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 @@ -29821,7 +29821,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, 
s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 @@ -29854,7 +29854,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s74 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s73 @@ -29999,24 +29999,24 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v20, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v18 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 @@ -30119,10 +30119,10 @@ define inreg <56 x half> 
@bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 @@ -32835,18 +32835,20 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 @@ -32896,12 +32898,10 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> 
inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -32922,13 +32922,13 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -32986,16 +32986,16 @@ define inreg <14 x i64> 
@bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_or_b32_e32 v19, v21, v20 ; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -33076,6 +33076,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: 
s_pack_ll_b32_b16 s7, s17, s7 @@ -33091,7 +33092,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 @@ -34362,13 +34362,13 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 ; SI-NEXT: v_mov_b32_e32 v27, s16 +; SI-NEXT: v_mov_b32_e32 v25, s18 +; SI-NEXT: v_mov_b32_e32 v21, s20 ; SI-NEXT: v_mov_b32_e32 v28, s17 -; SI-NEXT: v_mov_b32_e32 v23, s18 -; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v25, s20 -; SI-NEXT: v_mov_b32_e32 v26, s21 -; SI-NEXT: v_mov_b32_e32 v21, s22 -; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v26, s19 +; SI-NEXT: v_mov_b32_e32 v22, s21 +; SI-NEXT: v_mov_b32_e32 v23, s22 +; SI-NEXT: v_mov_b32_e32 v24, s23 ; SI-NEXT: v_mov_b32_e32 v19, s24 ; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_mov_b32_e32 v17, s26 @@ -34397,10 +34397,10 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v38, v16, v15, 16 ; SI-NEXT: v_alignbit_b32 v48, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v51, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v53, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v40, v22, v21, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v42, v26, v25, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v44, v28, v27, 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 @@ -34414,19 +34414,19 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, 
; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 @@ -34447,10 +34447,10 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v38, v16, v15, 16 ; SI-NEXT: v_alignbit_b32 v48, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v51, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v53, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v40, v22, v21, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v42, v26, v25, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v44, v28, v27, 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 @@ -34464,11 +34464,11 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: 
v_lshrrev_b32_e32 v46, 16, v22 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 ; SI-NEXT: .LBB49_3: ; %end @@ -34483,38 +34483,38 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v42 -; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v47 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; 
SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v21, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v53 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 @@ -34685,21 +34685,21 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v19, s16 -; VI-NEXT: v_mov_b32_e32 v20, s17 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v32, s20 -; VI-NEXT: v_mov_b32_e32 v33, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v36, s24 +; VI-NEXT: 
v_mov_b32_e32 v37, s25 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v17, s28 -; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v22, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -34724,20 +34724,20 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; 
VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -34747,13 +34747,13 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -34768,52 +34768,52 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; VI-NEXT: 
v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v19, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v23, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 
v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 -; VI-NEXT: v_or_b32_sdwa v32, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; VI-NEXT: v_or_b32_sdwa v33, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_or_b32_sdwa v48, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v21, v15 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; VI-NEXT: v_or_b32_sdwa v49, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34868,12 +34868,12 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr48 @@ -34898,21 +34898,21 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v32, s20 -; GFX9-NEXT: v_mov_b32_e32 v33, s21 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 
v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v36, s24 +; GFX9-NEXT: v_mov_b32_e32 v37, s25 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v17, s28 -; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 +; GFX9-NEXT: v_mov_b32_e32 v22, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -34937,20 +34937,20 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -34960,13 +34960,13 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -34981,60 +34981,56 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; GFX9-NEXT: .LBB49_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v32, v47, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v33, v46, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 -; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v36, v47, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v37, v46, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 
v38, v38, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v19 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: v_lshl_or_b32 v20, v40, 16, v0 @@ -35047,10 +35043,14 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v21, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v23 +; GFX9-NEXT: 
v_and_b32_e32 v23, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v22, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v23, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 ; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v0 @@ -35081,12 +35081,12 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr48 @@ -36984,18 +36984,20 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; 
VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 @@ -37045,12 +37047,10 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -37073,22 +37073,22 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -37266,6 +37266,7 @@ 
define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -37281,7 +37282,6 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 @@ -38893,11 +38893,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_lshr_b32 s42, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 ; SI-NEXT: s_lshr_b32 s42, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s42 ; SI-NEXT: s_lshr_b32 s42, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 ; SI-NEXT: s_lshr_b32 s42, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s42 ; SI-NEXT: s_lshr_b32 s42, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 ; SI-NEXT: s_lshr_b32 s42, s11, 16 @@ -38907,7 +38907,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_lshr_b32 s42, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 ; SI-NEXT: s_lshr_b32 s42, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 ; SI-NEXT: s_lshr_b32 s42, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s42 ; SI-NEXT: s_lshr_b32 s42, s14, 16 @@ -38956,8 +38956,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 
-; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 @@ -39010,7 +39010,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 @@ -39025,7 +39025,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 ; SI-NEXT: v_add_f64 v[30:31], s[28:29], 1.0 ; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0 ; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v35 @@ -39034,21 +39034,21 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 ; 
SI-NEXT: v_cvt_f32_f16_e32 v60, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 @@ -39064,15 +39064,15 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -39218,15 +39218,15 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: 
v_or_b32_e32 v4, v5, v4 @@ -39253,7 +39253,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -39267,7 +39267,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -39342,9 +39342,9 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr13 @@ -39353,11 +39353,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -39368,21 +39368,21 @@ define inreg <56 x half> 
@bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v19, s16 -; VI-NEXT: v_mov_b32_e32 v20, s17 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v32, s20 -; VI-NEXT: v_mov_b32_e32 v33, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v36, s24 +; VI-NEXT: v_mov_b32_e32 v37, s25 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v17, s28 -; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v22, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -39407,20 +39407,20 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 
16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -39430,13 +39430,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -39451,52 +39451,52 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; VI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v19, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v23, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 
v23, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 -; VI-NEXT: v_or_b32_sdwa v32, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; VI-NEXT: v_or_b32_sdwa v33, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_or_b32_sdwa v48, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; VI-NEXT: v_or_b32_sdwa v49, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -39551,12 +39551,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 ; 
VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr48 @@ -39581,21 +39581,21 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v32, s20 -; GFX9-NEXT: v_mov_b32_e32 v33, s21 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v36, s24 +; GFX9-NEXT: v_mov_b32_e32 v37, s25 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v17, s28 -; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 +; GFX9-NEXT: v_mov_b32_e32 v22, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -39620,20 +39620,20 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v45, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -39643,13 +39643,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 ; GFX9-NEXT: v_add_f64 
v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -39664,60 +39664,56 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; GFX9-NEXT: .LBB53_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 
16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v32, v47, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v33, v46, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 -; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v36, v47, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v37, v46, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v19 +; GFX9-NEXT: 
v_lshl_or_b32 v30, v30, 16, v19 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: v_lshl_or_b32 v20, v40, 16, v0 @@ -39730,10 +39726,14 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v21, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v22, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v23, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 ; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v0 @@ -39764,12 +39764,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr48 @@ -42027,18 +42027,20 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> 
inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 @@ -42088,12 +42090,10 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, 
s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -42114,13 +42114,13 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -42178,16 +42178,16 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: 
v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_or_b32_e32 v19, v21, v20 ; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -42268,6 +42268,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -42283,7 +42284,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 @@ -44232,8 +44232,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB57_2 ; SI-NEXT: ; 
%bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44245,36 +44245,36 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v57 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v35 ; SI-NEXT: v_mov_b32_e32 v47, v34 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 ; SI-NEXT: 
v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v61 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44357,7 +44357,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v59 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB57_3 ; SI-NEXT: .LBB57_2: ; SI-NEXT: ; implicit-def: $vgpr33 @@ -44368,26 +44368,26 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: 
$vgpr40 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 @@ -44459,11 +44459,11 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v33, v34 ; SI-NEXT: v_mov_b32_e32 v34, v36 ; SI-NEXT: v_mov_b32_e32 v36, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v52 -; SI-NEXT: v_mov_b32_e32 v52, v54 -; SI-NEXT: v_mov_b32_e32 v54, v40 -; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v51, v53 +; SI-NEXT: v_mov_b32_e32 v53, v55 +; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_mov_b32_e32 v42, v44 ; SI-NEXT: v_mov_b32_e32 v44, v46 ; SI-NEXT: s_cbranch_vccnz .LBB57_5 @@ -44503,7 +44503,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 ; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 ; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 ; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 @@ -44522,25 +44522,25 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 ; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v46 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v63 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v62 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v1 @@ -44553,7 +44553,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload @@ -44561,7 +44561,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v26, 
vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload @@ -44569,7 +44569,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -44710,13 +44710,13 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_5: ; %end ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -44741,7 +44741,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: 
v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -44914,61 +44914,61 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: 
v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 @@ -45020,8 +45020,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 @@ -45202,8 +45202,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 @@ -47614,8 +47614,8 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 @@ -47830,8 +47830,8 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4f46875076809..b3e469bb89100 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -218,7 +218,6 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -231,6 +230,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -293,7 +293,6 @@ define inreg <30 x 
float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -306,6 +305,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB1_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -368,7 +368,6 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -381,6 +380,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -698,7 +698,6 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -711,6 +710,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 
; SI-NEXT: ; %bb.1: ; %cmp.false @@ -773,7 +773,6 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -786,6 +785,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB3_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -848,7 +848,6 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -861,6 +860,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -1178,7 +1178,6 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -1191,6 +1190,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], 
vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -1253,7 +1253,6 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -1266,6 +1265,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB5_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -1328,7 +1328,6 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -1341,6 +1340,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -1681,7 +1681,6 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -1694,6 +1693,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; 
SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -1756,7 +1756,6 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -1769,6 +1768,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -1831,7 +1831,6 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -1844,6 +1843,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -2184,7 +2184,6 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -2197,6 +2196,7 @@ define inreg <15 x double> 
@bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -2259,7 +2259,6 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -2272,6 +2271,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB9_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -2334,7 +2334,6 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -2347,6 +2346,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -2603,35 +2603,35 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_mov_b32_e32 v29, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 
-; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v26, v12 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v31, v3 -; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v30, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -2664,35 +2664,35 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, 
v8 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v31, v3 -; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v30, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -2725,35 +2725,35 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-NEXT: v_mov_b32_e32 
v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -6706,13 +6706,15 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 @@ -6772,12 +6774,10 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -6800,11 +6800,11 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 -; VI-NEXT: 
v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -6816,14 +6816,14 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -7005,6 +7005,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -7020,7 +7021,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 @@ -8890,7 +8890,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s43, 16 @@ -8940,7 +8940,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 @@ -9032,7 +9032,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 ; SI-NEXT: 
v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 @@ -9067,7 +9067,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s89 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s88 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s78 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s77 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s76 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s75 @@ -9212,24 +9212,24 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v24, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v21, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 ; SI-NEXT: 
v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 @@ -9350,10 +9350,10 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -12347,13 +12347,15 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_cbranch_scc0 .LBB19_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 @@ -12413,12 +12415,10 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: 
s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -12439,13 +12439,13 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -12509,16 +12509,16 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: 
v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v20, v21, v20 -; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_or_b32_e32 v21, v23, v22 ; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -12603,6 +12603,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -12618,7 +12619,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x 
half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 @@ -13272,7 +13272,6 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -13285,6 +13284,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -13347,7 +13347,6 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -13360,6 +13359,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB21_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -13422,7 +13422,6 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; 
GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -13435,6 +13434,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -13760,7 +13760,6 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -13773,6 +13772,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -13835,7 +13835,6 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -13848,6 +13847,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB23_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -13910,7 +13910,6 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -13923,6 +13922,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -14248,7 +14248,6 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -14261,6 +14260,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -14323,7 +14323,6 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -14336,6 +14335,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB25_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -14398,7 +14398,6 @@ define inreg <15 x double> 
@bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -14411,6 +14410,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -14652,35 +14652,35 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_mov_b32_e32 v29, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v26, v12 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v31, v3 -; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v30, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: 
v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -14713,35 +14713,35 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v31, v3 -; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v30, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; 
VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -14774,35 +14774,35 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; 
GFX9-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -15867,14 +15867,14 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_mov_b32_e32 v30, s16 -; SI-NEXT: v_mov_b32_e32 v28, s17 -; SI-NEXT: v_mov_b32_e32 v33, s18 +; SI-NEXT: v_mov_b32_e32 v32, s16 +; SI-NEXT: v_mov_b32_e32 v30, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v25, s19 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v32, s19 -; SI-NEXT: v_mov_b32_e32 v29, s20 -; SI-NEXT: v_mov_b32_e32 v27, s21 -; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v31, s20 +; SI-NEXT: v_mov_b32_e32 v29, s21 +; SI-NEXT: v_mov_b32_e32 v26, s22 ; SI-NEXT: v_mov_b32_e32 v24, s23 ; SI-NEXT: v_mov_b32_e32 v23, s24 ; SI-NEXT: v_mov_b32_e32 v21, s25 @@ -15898,8 +15898,8 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v26, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 ; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 @@ -15908,12 +15908,12 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v51, v17, v18, 16 ; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 ; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 -; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v26, 16 +; SI-NEXT: v_alignbit_b32 v44, v29, v31, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 +; SI-NEXT: v_alignbit_b32 v46, v25, v28, 16 ; SI-NEXT: s_waitcnt 
expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 +; SI-NEXT: v_alignbit_b32 v56, v30, v32, 16 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 @@ -15928,21 +15928,21 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v29 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 @@ -15966,8 +15966,8 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v26, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 ; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 @@ -15976,12 +15976,12 @@ define inreg <60 x 
i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v51, v17, v18, 16 ; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 ; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 -; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v26, 16 +; SI-NEXT: v_alignbit_b32 v44, v29, v31, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 +; SI-NEXT: v_alignbit_b32 v46, v25, v28, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 +; SI-NEXT: v_alignbit_b32 v56, v30, v32, 16 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 @@ -15996,53 +15996,53 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v29 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_or_b32_e32 v30, v30, v56 -; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v32, v32, v56 +; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v60 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v60 +; 
SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v30, v32, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 ; SI-NEXT: v_or_b32_e32 v28, v28, v30 ; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_or_b32_e32 v25, v25, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v25, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_or_b32_e32 v25, v25, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; SI-NEXT: 
buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 @@ -16147,7 +16147,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -16159,7 +16159,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -16221,9 +16221,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -16233,20 +16233,20 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: 
v_mov_b32_e32 v19, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v29, s20 -; VI-NEXT: v_mov_b32_e32 v27, s21 -; VI-NEXT: v_mov_b32_e32 v26, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v24, s24 -; VI-NEXT: v_mov_b32_e32 v23, s25 -; VI-NEXT: v_mov_b32_e32 v22, s26 -; VI-NEXT: v_mov_b32_e32 v21, s27 -; VI-NEXT: v_mov_b32_e32 v20, s28 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v28, s22 +; VI-NEXT: v_mov_b32_e32 v27, s23 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v25, s25 +; VI-NEXT: v_mov_b32_e32 v24, s26 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v21, s28 ; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -16279,19 +16279,19 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; VI-NEXT: 
v_lshrrev_b32_e32 v51, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; VI-NEXT: s_cbranch_execnz .LBB29_3 ; VI-NEXT: .LBB29_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -16311,19 +16311,19 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -16341,46 +16341,46 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; 
VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; VI-NEXT: .LBB29_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v20, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; VI-NEXT: v_or_b32_sdwa v48, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 -; VI-NEXT: v_or_b32_sdwa v49, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 -; VI-NEXT: v_or_b32_sdwa v50, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v51, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; VI-NEXT: v_or_b32_sdwa v52, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 ; VI-NEXT: v_or_b32_sdwa v53, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 @@ -16482,20 +16482,20 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v28, s19 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v29, s20 -; GFX9-NEXT: v_mov_b32_e32 v27, s21 -; GFX9-NEXT: v_mov_b32_e32 v26, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v24, s24 -; GFX9-NEXT: v_mov_b32_e32 v23, s25 -; GFX9-NEXT: v_mov_b32_e32 v22, s26 -; GFX9-NEXT: v_mov_b32_e32 v21, s27 -; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s22 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s25 +; GFX9-NEXT: v_mov_b32_e32 v24, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 ; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], 
s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -16528,19 +16528,19 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; GFX9-NEXT: s_cbranch_execnz .LBB29_3 ; GFX9-NEXT: .LBB29_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -16560,19 +16560,19 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 ; 
GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -16590,47 +16590,43 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; GFX9-NEXT: .LBB29_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 -; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 ; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v59, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 @@ -16639,16 +16635,20 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; 
GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v22 ; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 @@ -18716,13 +18716,15 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 @@ -18782,12 +18784,10 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -18810,11 +18810,11 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -18826,14 +18826,14 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 @@ -19015,6 +19015,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -19030,7 +19031,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 @@ -20874,7 +20874,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s43, 16 @@ -20924,7 +20924,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v63, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 @@ -20995,15 +20995,15 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 ; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s27, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 ; SI-NEXT: 
v_lshrrev_b32_e32 v32, 16, v25 ; SI-NEXT: v_add_f32_e64 v18, s13, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 ; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 @@ -21029,14 +21029,14 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v55, v58 ; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v31, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s15, 1.0 ; SI-NEXT: v_add_f32_e64 v20, s14, 1.0 ; SI-NEXT: v_add_f32_e64 v33, s11, 1.0 ; SI-NEXT: v_add_f32_e64 v52, s7, 1.0 ; SI-NEXT: v_add_f32_e64 v44, s9, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v52 @@ -21045,7 +21045,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v45, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -21214,7 +21214,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 ; SI-NEXT: v_cvt_f16_f32_e32 
v5, v25 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -21229,7 +21229,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 @@ -21352,10 +21352,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -21381,20 +21381,20 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v19, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v29, s20 -; VI-NEXT: v_mov_b32_e32 v27, s21 -; VI-NEXT: v_mov_b32_e32 v26, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v24, s24 -; VI-NEXT: v_mov_b32_e32 v23, s25 -; VI-NEXT: v_mov_b32_e32 v22, s26 -; VI-NEXT: v_mov_b32_e32 v21, s27 -; VI-NEXT: v_mov_b32_e32 v20, s28 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v28, s22 +; 
VI-NEXT: v_mov_b32_e32 v27, s23 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v25, s25 +; VI-NEXT: v_mov_b32_e32 v24, s26 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v21, s28 ; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -21427,19 +21427,19 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; VI-NEXT: s_cbranch_execnz .LBB33_3 ; VI-NEXT: .LBB33_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -21459,19 +21459,19 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 
v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -21489,46 +21489,46 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; VI-NEXT: 
v_lshrrev_b32_e32 v36, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; VI-NEXT: .LBB33_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 -; VI-NEXT: 
v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; VI-NEXT: v_or_b32_sdwa v48, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 -; VI-NEXT: v_or_b32_sdwa v49, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 -; VI-NEXT: v_or_b32_sdwa v50, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v51, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; VI-NEXT: v_or_b32_sdwa v52, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 ; VI-NEXT: v_or_b32_sdwa v53, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 @@ -21630,20 +21630,20 @@ define inreg <60 x half> 
@bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v28, s19 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v29, s20 -; GFX9-NEXT: v_mov_b32_e32 v27, s21 -; GFX9-NEXT: v_mov_b32_e32 v26, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v24, s24 -; GFX9-NEXT: v_mov_b32_e32 v23, s25 -; GFX9-NEXT: v_mov_b32_e32 v22, s26 -; GFX9-NEXT: v_mov_b32_e32 v21, s27 -; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s22 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s25 +; GFX9-NEXT: v_mov_b32_e32 v24, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 ; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -21676,19 +21676,19 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v36, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; GFX9-NEXT: s_cbranch_execnz .LBB33_3 ; GFX9-NEXT: .LBB33_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -21708,19 +21708,19 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -21738,47 +21738,43 @@ define inreg <60 x half> 
@bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; GFX9-NEXT: .LBB33_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 -; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 
v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 ; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v59, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 @@ -21787,16 +21783,20 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: 
v_lshl_or_b32 v32, v32, 16, v22 ; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 @@ -24334,13 +24334,15 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: s_cbranch_scc0 .LBB35_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 @@ -24400,12 +24402,10 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -24426,13 +24426,13 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -24496,16 +24496,16 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 -; VI-NEXT: 
v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v20, v21, v20 -; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_or_b32_e32 v21, v23, v22 ; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -24590,6 +24590,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -24605,7 +24606,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 @@ -25282,7 +25282,6 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; SI-NEXT: 
v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -25295,6 +25294,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB37_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -25357,7 +25357,6 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -25370,6 +25369,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB37_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -25432,7 +25432,6 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -25445,6 +25444,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ 
-25709,35 +25709,35 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_mov_b32_e32 v29, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v26, v12 ; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v24, v10 ; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v22, v8 ; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v31, v3 -; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v30, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_mov_b32_e32 v5, s21 ; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -25770,35 +25770,35 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; 
VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v31, v3 -; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v30, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -25831,35 +25831,35 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 -; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 -; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; 
GFX9-NEXT: v_mov_b32_e32 v26, v12 ; GFX9-NEXT: v_mov_b32_e32 v23, v9 -; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -29828,13 +29828,15 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 @@ -29894,12 +29896,10 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -29922,11 +29922,11 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -29938,14 +29938,14 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: 
v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -30127,6 +30127,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -30142,7 +30143,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 @@ -32029,7 +32029,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 @@ -32079,7 +32079,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: 
v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 @@ -32171,7 +32171,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 @@ -32206,7 +32206,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s89 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s88 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s78 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s77 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s76 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s75 @@ -32351,24 +32351,24 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v24, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v21, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; 
SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 @@ -32489,10 +32489,10 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -35486,13 +35486,15 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_cbranch_scc0 .LBB47_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, 
v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 @@ -35552,12 +35554,10 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -35578,13 +35578,13 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: 
v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -35648,16 +35648,16 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v20, v21, v20 -; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_or_b32_e32 v21, v23, v22 ; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -35742,6 +35742,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -35757,7 +35758,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 @@ -37142,12 +37142,12 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_mov_b32_e32 v27, s16 -; SI-NEXT: v_mov_b32_e32 v28, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 -; SI-NEXT: v_mov_b32_e32 v25, s20 -; SI-NEXT: v_mov_b32_e32 v26, s21 +; SI-NEXT: v_mov_b32_e32 v29, s16 +; SI-NEXT: v_mov_b32_e32 v25, s18 +; SI-NEXT: v_mov_b32_e32 v30, s17 +; SI-NEXT: v_mov_b32_e32 v26, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 ; SI-NEXT: v_mov_b32_e32 v23, s22 ; SI-NEXT: v_mov_b32_e32 v24, s23 ; SI-NEXT: v_mov_b32_e32 v21, s24 @@ -37184,11 +37184,11 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 ; SI-NEXT: v_alignbit_b32 v55, v22, v21, 16 ; SI-NEXT: v_alignbit_b32 v41, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v43, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v43, v28, v27, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v46, v26, v25, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v56, v30, v29, 16 ; SI-NEXT: v_lshrrev_b32_e32 
v38, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 @@ -37203,16 +37203,16 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 @@ -37237,11 +37237,11 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 ; SI-NEXT: v_alignbit_b32 v55, v22, v21, 16 ; SI-NEXT: v_alignbit_b32 v41, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v43, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v43, v28, v27, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v46, v26, v25, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v56, v30, v29, 16 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 @@ -37256,43 +37256,43 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 
v58, 16, v28 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 ; SI-NEXT: .LBB49_3: ; %end ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v56 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v46 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v56 +; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v60 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v43 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: 
v_add_i32_e32 v29, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v25, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v59 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v43 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 @@ -37493,21 +37493,21 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v29, s18 -; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: v_mov_b32_e32 v27, s20 -; VI-NEXT: v_mov_b32_e32 v28, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v29, s22 +; VI-NEXT: v_mov_b32_e32 v30, s23 +; VI-NEXT: v_mov_b32_e32 v27, s24 +; VI-NEXT: v_mov_b32_e32 v28, s25 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v19, s28 -; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: v_mov_b32_e32 v23, s28 +; 
VI-NEXT: v_mov_b32_e32 v24, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -37538,20 +37538,20 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -37562,13 +37562,13 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: 
v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -37585,49 +37585,49 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; 
VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v29, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; VI-NEXT: v_or_b32_sdwa v48, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 -; VI-NEXT: v_or_b32_sdwa v49, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 -; VI-NEXT: v_or_b32_sdwa v50, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v51, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; VI-NEXT: v_or_b32_sdwa v52, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 -; VI-NEXT: v_or_b32_sdwa v53, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v53, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 ; VI-NEXT: v_or_b32_sdwa v30, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 @@ -37727,21 +37727,21 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_mov_b32_e32 v17, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v29, s18 -; GFX9-NEXT: v_mov_b32_e32 v30, s19 -; GFX9-NEXT: v_mov_b32_e32 v27, s20 -; GFX9-NEXT: v_mov_b32_e32 v28, s21 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v29, s22 +; GFX9-NEXT: v_mov_b32_e32 v30, s23 +; GFX9-NEXT: v_mov_b32_e32 v27, s24 +; GFX9-NEXT: v_mov_b32_e32 v28, s25 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v19, s28 -; GFX9-NEXT: v_mov_b32_e32 v20, s29 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v24, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -37772,20 +37772,20 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 
v50, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -37796,13 +37796,13 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 
v40, 16, v13 @@ -37819,64 +37819,64 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: .LBB49_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 -; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 -; 
GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v30, v59, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; 
GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 @@ -39946,13 +39946,15 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 @@ -40012,12 +40014,10 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -40040,11 +40040,11 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -40056,14 +40056,14 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v1, vcc, 
3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -40245,6 +40245,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -40260,7 +40261,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: 
s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 @@ -42548,21 +42548,21 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v29, s18 -; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: v_mov_b32_e32 v27, s20 -; VI-NEXT: v_mov_b32_e32 v28, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v29, s22 +; VI-NEXT: v_mov_b32_e32 v30, s23 +; VI-NEXT: v_mov_b32_e32 v27, s24 +; VI-NEXT: v_mov_b32_e32 v28, s25 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v19, s28 -; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: v_mov_b32_e32 v23, s28 +; VI-NEXT: v_mov_b32_e32 v24, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -42593,20 +42593,20 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: 
v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -42617,13 +42617,13 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: 
v_lshrrev_b32_e32 v54, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -42640,49 +42640,49 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; VI-NEXT: v_or_b32_sdwa v48, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v27, v17 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 -; VI-NEXT: v_or_b32_sdwa v49, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 -; VI-NEXT: v_or_b32_sdwa v50, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v51, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; VI-NEXT: v_or_b32_sdwa v52, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 -; VI-NEXT: v_or_b32_sdwa v53, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v53, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 ; VI-NEXT: v_or_b32_sdwa v30, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 @@ -42782,21 +42782,21 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_mov_b32_e32 v17, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v29, s18 -; GFX9-NEXT: v_mov_b32_e32 v30, s19 -; GFX9-NEXT: v_mov_b32_e32 v27, s20 -; GFX9-NEXT: v_mov_b32_e32 v28, s21 -; GFX9-NEXT: 
v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v29, s22 +; GFX9-NEXT: v_mov_b32_e32 v30, s23 +; GFX9-NEXT: v_mov_b32_e32 v27, s24 +; GFX9-NEXT: v_mov_b32_e32 v28, s25 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v19, s28 -; GFX9-NEXT: v_mov_b32_e32 v20, s29 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v24, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -42827,20 +42827,20 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 
v53, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -42851,13 +42851,13 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -42874,64 +42874,64 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v48, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: .LBB53_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 -; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 
; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v30, v59, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 @@ -45471,13 +45471,15 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: s_cbranch_scc0 .LBB55_4 
; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 -; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 @@ -45537,12 +45539,10 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -45563,13 +45563,13 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -45633,16 +45633,16 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: v_or_b32_e32 v17, v18, v17 ; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_add_f16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v20, v21, v20 -; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v50 +; VI-NEXT: v_add_f16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v49 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_or_b32_e32 v21, v23, v22 ; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 ; VI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -45727,6 +45727,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 @@ -45742,7 +45743,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 @@ -47875,35 +47875,35 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_cvt_f32_f16_e32 v31, v2 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v29 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v48 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v38 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v37 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 @@ -47917,7 +47917,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v35 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 @@ -47945,7 +47945,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 @@ -47972,35 +47972,41 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 @@ -48016,12 +48022,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v27 ; SI-NEXT: s_branch .LBB57_3 ; SI-NEXT: .LBB57_2: ; SI-NEXT: ; implicit-def: $vgpr31 @@ -48054,15 +48054,15 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; 
implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -48116,12 +48116,12 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -48140,10 +48140,11 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v52, v54 ; SI-NEXT: v_mov_b32_e32 v54, v40 ; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v46, v56 -; SI-NEXT: v_mov_b32_e32 v56, v31 +; SI-NEXT: v_mov_b32_e32 v42, v43 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v45, v47 +; SI-NEXT: v_mov_b32_e32 v47, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 ; SI-NEXT: s_cbranch_vccnz .LBB57_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded 
Reload @@ -48177,64 +48178,63 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 ; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 ; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 ; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 ; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 -; 
SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v37 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 @@ -48242,20 +48242,21 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 ; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload @@ -48399,9 +48400,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -48411,6 +48409,9 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_5: ; %end @@ -48624,18 +48625,16 @@ define inreg <60 x half> 
@bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -48644,7 +48643,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -48652,28 +48651,30 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -48750,8 +48751,8 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 @@ -48944,10 +48945,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, 
v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 @@ -49018,9 +49019,9 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v14, v28, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v11, v55, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v10, v54, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v55, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v54, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 ; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6 @@ -49072,9 +49073,9 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 @@ -49171,11 +49172,11 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v22, v24, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v23, v25, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 -; GFX9-NEXT: v_lshl_or_b32 v24, v54, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GFX9-NEXT: v_lshl_or_b32 v25, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 ; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 ; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 @@ -51566,8 +51567,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 @@ -51802,10 +51803,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 @@ -51852,9 +51853,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v14, v28, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v11, v55, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v10, v54, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v55, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v54, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 ; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6 @@ -51932,9 +51933,9 @@ define inreg <60 x i16> 
@bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 @@ -52031,11 +52032,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v22, v24, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v23, v25, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 -; GFX9-NEXT: v_lshl_or_b32 v24, v54, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GFX9-NEXT: v_lshl_or_b32 v25, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 ; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 879e8520d8e18..91910ecbbfc0b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -10620,15 +10620,17 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v6i16_to_v12i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v6 ; SI-NEXT: v_mov_b32_e32 v15, v5 ; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v18, v1 ; SI-NEXT: v_mov_b32_e32 v12, v4 ; SI-NEXT: v_mov_b32_e32 v13, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -10654,9 +10656,9 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_or_b32_e32 v4, v1, v18 -; SI-NEXT: v_or_b32_e32 v8, v6, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v4, v1, v20 +; SI-NEXT: v_or_b32_e32 v8, v6, v19 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -10667,25 +10669,25 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v7, v16, 8, 8 ; SI-NEXT: v_bfe_u32 v11, v15, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: 
v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll index 7bf9a29e9ff44..7db3656652726 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll @@ -34,7 +34,7 @@ define void @func_illegal_agpr_use_asm() #0 { ; GFX908: v_accvgpr_write_b32 ; GFX90A-NOT: v_accvgpr_write_b32 -; GFX908: NumVgprs: 5 +; GFX908: NumVgprs: 30 ; GFX908: NumAgprs: 32 ; GFX90A: NumVgprs: 35 ; GFX90A: NumAgprs: 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index 85b5c7c870b23..ff0e29b97be73 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs -; TRAP-HANDLER-ENABLE: NumSgprs: 61 -; TRAP-HANDLER-DISABLE: NumSgprs: 77 +; TRAP-HANDLER-ENABLE: NumSgprs: 63 +; TRAP-HANDLER-DISABLE: NumSgprs: 78 define amdgpu_kernel void @amdhsa_trap_num_sgprs( ptr addrspace(1) %out0, i32 %in0, ptr addrspace(1) %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 10e523d1a0cf1..fe20f32b327da 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -415,7 +415,6 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; 
GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -424,6 +423,7 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -444,7 +444,6 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -453,6 +452,7 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -535,7 +535,6 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -544,7 +543,6 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 @@ -552,14 
+550,16 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 @@ -584,7 +584,6 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -593,7 +592,6 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 @@ -601,14 +599,16 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 @@ -699,12 +699,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen @@ -712,9 +714,9 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen @@ -722,63 
+724,60 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 ; GCN-NEXT: 
buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0 -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0 -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 16, v0 +; GCN-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; GCN-NEXT: 
buffer_store_dword v3, v8, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -796,6 +795,9 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0 +; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0 +; GFX7-NEXT: v_add_i32_e32 v22, vcc, 16, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen @@ -839,35 +841,32 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 40, v0 ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, 
vcc, 40, v0 -; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(9) ; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
s_setpc_b64 s[30:31] @@ -884,20 +883,20 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v28 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v29, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v28 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc ; GFX8-NEXT: s_movk_i32 s4, 0x50 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v28 +; GFX8-NEXT: s_movk_i32 s5, 0x60 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v29, vcc -; GFX8-NEXT: s_movk_i32 s4, 0x60 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v28 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, s5, v28 ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc ; GFX8-NEXT: s_movk_i32 s4, 0x70 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29] -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] @@ -1414,153 +1413,152 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_alignbit_b32 v18, v17, v18, 16 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 
16, v5 ; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_alignbit_b32 v4, v17, v4, 16 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v14, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: s_mov_b32 
s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16 -; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v13, v25, v6, 16 +; GCN-NEXT: v_alignbit_b32 v12, v7, v10, 16 +; GCN-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v8, v26, v17, 16 +; GCN-NEXT: v_alignbit_b32 v7, v27, v22, 16 +; GCN-NEXT: v_alignbit_b32 v6, v28, v24, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[15:16], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26 -; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 
addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[15:16], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v9, v0, v23, 16 +; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[15:16], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[15:16], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; GFX7-NEXT: v_alignbit_b32 v14, v2, v6, 16 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: 
v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GFX7-NEXT: v_alignbit_b32 v22, v6, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; GFX7-NEXT: v_alignbit_b32 v12, v0, v6, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v12, v7, v12, 16 +; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_alignbit_b32 v11, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v20 +; GFX7-NEXT: v_alignbit_b32 v20, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v19, v6, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16 -; GFX7-NEXT: v_mul_f32_e32 v10, 
1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_alignbit_b32 v21, v8, v9, 16 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v28, v6, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 +; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 ; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v27 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v28 ; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v25 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v10, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v24 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v10, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 
0 addr64 offset:48 -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[19:22], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1633,200 +1631,201 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[22:23], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 -; GCN-NEXT: 
s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_alignbit_b32 v12, v18, v12, 16 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v4, v8, v4, 16 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16 
-; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16 -; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16 -; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 
v7, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; GCN-NEXT: v_alignbit_b32 v7, v6, v7, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_alignbit_b32 v6, v0, v1, 16 ; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(13) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v32 ; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_alignbit_b32 v17, v0, v1, 16 +; GCN-NEXT: v_alignbit_b32 v16, v15, v16, 16 ; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_alignbit_b32 v15, v0, v1, 16 +; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: v_mul_f32_e32 v1, 
1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_alignbit_b32 v21, v0, v1, 16 +; GCN-NEXT: v_alignbit_b32 v20, v19, v20, 16 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16 -; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16 -; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; 
GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v19, v0, v19, 16 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(5) ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_alignbit_b32 v18, v0, v18, 16 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GCN-NEXT: v_alignbit_b32 v27, v0, v26, 16 +; GCN-NEXT: v_alignbit_b32 v26, v28, v1, 16 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: 
v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16 -; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v25, v0, v24, 16 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v31 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v24, v0, v1, 16 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 ; GCN-NEXT: s_waitcnt 
vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_alignbit_b32 v31, v0, v1, 16 +; GCN-NEXT: v_alignbit_b32 v30, v28, v29, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v29, v1, v0, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 
v[18:21], v[8:9], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v28, v0, v1, 16 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[22:23], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[22:23], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[28:31], v[22:23], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[24:27], v[22:23], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[22:23], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[22:23], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[22:23], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -1837,10 +1836,11 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 +; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:136 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 @@ -1864,128 +1864,101 @@ define void @v_store_global_v64bf16(<64 x bfloat> 
%val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: s_waitcnt vmcnt(8) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: s_waitcnt vmcnt(7) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16 +; GFX7-NEXT: v_mul_f32_e32 v39, 1.0, v34 +; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v35 ; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v36 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_alignbit_b32 v32, v31, v32, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37 -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48 +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], 
s32 offset:80 +; GFX7-NEXT: v_alignbit_b32 v33, v33, v39, 16 +; GFX7-NEXT: v_alignbit_b32 v31, v31, v35, 16 +; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:132 +; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112 +; GFX7-NEXT: buffer_store_dwordx4 v[31:34], v[48:49], s[4:7], 0 addr64 offset:112 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v35 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v36 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37 +; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49 -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39 +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 
-; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v50 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_alignbit_b32 v33, v33, v35, 16 +; GFX7-NEXT: v_alignbit_b32 v32, v31, v32, 16 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 ; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48 ; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 ; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GFX7-NEXT: v_alignbit_b32 v31, v31, v35, 16 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v36 ; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v36, 1.0, v37 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; 
GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v38 +; GFX7-NEXT: v_alignbit_b32 v38, v35, v36, 16 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v50 +; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX7-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v36, 1.0, v51 +; GFX7-NEXT: v_alignbit_b32 v37, v37, v39, 16 +; GFX7-NEXT: v_alignbit_b32 v36, v35, v36, 16 +; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; GFX7-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:16 ; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v37, 
1.0, v49 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v35 ; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64 -; GFX7-NEXT: s_nop 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GFX7-NEXT: v_alignbit_b32 v35, v35, v39, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v39, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 @@ -1998,26 +1971,29 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:12 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 ; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 ; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 ; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 ; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; GFX7-NEXT: s_waitcnt vmcnt(8) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v50 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 ; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16 @@ -2030,11 +2006,37 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24 ; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v53 +; GFX7-NEXT: v_alignbit_b32 v21, v0, v1, 16 +; GFX7-NEXT: 
s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v54 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v20, v19, v20, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v39, v4, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[31:34], v[48:49], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: buffer_store_dwordx4 v[35:38], v[48:49], s[4:7], 0 addr64 offset:80 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v19, v0, v1, 16 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[48:49], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[48:49], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[48:49], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[48:49], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[48:49], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2045,34 +2047,34 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: s_movk_i32 s4, 0x70 -; GFX8-NEXT: s_movk_i32 s5, 0x50 +; GFX8-NEXT: s_movk_i32 s5, 0x60 +; GFX8-NEXT: s_movk_i32 s6, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u32_e32 v34, vcc, s4, v32 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc -; GFX8-NEXT: s_movk_i32 s4, 0x60 +; GFX8-NEXT: v_add_u32_e32 v36, vcc, s5, v32 +; GFX8-NEXT: v_add_u32_e64 v38, s[4:5], s6, v32 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[28:31] +; 
GFX8-NEXT: v_addc_u32_e32 v37, vcc, 0, v33, vcc +; GFX8-NEXT: v_add_u32_e64 v28, s[6:7], 64, v32 +; GFX8-NEXT: v_add_u32_e64 v30, s[8:9], 48, v32 +; GFX8-NEXT: v_addc_u32_e64 v39, vcc, 0, v33, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v29, vcc, 0, v33, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v31, vcc, 0, v33, s[8:9] ; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3] -; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v32 -; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v32 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v32 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v32 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v32 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v33, vcc -; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, 48, v32 -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v33, vcc -; GFX8-NEXT: v_add_u32_e32 v26, vcc, 32, v32 -; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v33, vcc -; GFX8-NEXT: v_add_u32_e32 v28, vcc, 16, v32 -; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[36:37], v[24:27] +; GFX8-NEXT: flat_store_dwordx4 v[38:39], v[20:23] +; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[16:19] +; GFX8-NEXT: flat_store_dwordx4 v[30:31], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5203,67 +5205,67 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], 
s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v20, s30, 0 -; GCN-NEXT: v_writelane_b32 v20, s31, 1 +; GCN-NEXT: v_writelane_b32 v23, s30, 0 +; GCN-NEXT: v_writelane_b32 v23, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 22, v16 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 20, v16 ; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_add_i32_e32 v15, vcc, 18, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 16, v16 ; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 14, v16 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 12, v16 ; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 10, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 8, v16 +; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 6, v16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 4, v16 +; GCN-NEXT: buffer_store_short v11, v21, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5274,31 +5276,36 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v16 +; GCN-NEXT: buffer_store_short v10, v22, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v9, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v8, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v7, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v13, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v4, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v3, v12, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 
0 offen +; GCN-NEXT: buffer_store_short v2, v20, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v11, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v20, 1 -; GCN-NEXT: v_readlane_b32 s30, v20, 0 +; GCN-NEXT: v_readlane_b32 s31, v23, 1 +; GCN-NEXT: v_readlane_b32 s30, v23, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5705,92 +5712,92 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x7c, v0 ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x78, v0 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: buffer_store_dword v31, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 -; GCN-NEXT: 
buffer_store_dword v30, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; GCN-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0 -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x70, v0 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: buffer_store_dword v32, v34, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x68, v0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x60, v0 +; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x5c, v0 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 +; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x54, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 +; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 -; GCN-NEXT: 
buffer_store_dword v24, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x4c, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x48, v0 +; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0 -; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 64, v0 +; GCN-NEXT: buffer_store_dword v26, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 60, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 56, v0 +; GCN-NEXT: buffer_store_dword v25, v30, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 52, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0 +; GCN-NEXT: buffer_store_dword v24, v33, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 -; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 44, v0 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 40, v0 +; GCN-NEXT: buffer_store_dword v23, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0 -; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; GCN-NEXT: buffer_store_dword v22, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0 -; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen +; 
GCN-NEXT: v_add_i32_e32 v22, vcc, 28, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 24, v0 +; GCN-NEXT: buffer_store_dword v21, v28, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 20, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 16, v0 +; GCN-NEXT: buffer_store_dword v20, v34, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; GCN-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, 
v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -5838,13 +5845,13 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 60, v0 ; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 52, v0 +; GFX7-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 @@ -5915,13 +5922,13 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 60, v0 ; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen -; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0 @@ -5956,15 +5963,16 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 ; GFX900-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:4 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 -; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 -; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 -; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 -; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 ; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 ; GFX900-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 ; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 ; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 @@ -5988,11 +5996,10 
@@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; GFX900-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX900-NEXT: s_waitcnt vmcnt(25) -; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124 -; GFX900-NEXT: s_waitcnt vmcnt(25) -; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 -; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: s_waitcnt vmcnt(27) +; GFX900-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:120 +; GFX900-NEXT: s_waitcnt vmcnt(27) ; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116 ; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -6653,7 +6660,6 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -6662,6 +6668,7 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -6682,7 +6689,6 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
v_lshlrev_b32_e32 v8, 16, v12 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -6691,6 +6697,7 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6709,7 +6716,6 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -6718,33 +6724,59 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v16bf16_to_v16f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v10, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v16bf16_to_v16f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX900-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v16bf16_to_v16f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v16bf16_to_v16f32: ; GFX10: ; %bb.0: @@ -6822,7 +6854,6 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -6831,7 +6862,6 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 @@ -6839,14 +6869,16 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_lshlrev_b32_e32 
v26, 16, v29 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 @@ -6871,7 +6903,6 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -6880,7 +6911,6 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 @@ -6888,14 +6918,16 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 @@ -6922,7 +6954,6 
@@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 @@ -6931,7 +6962,6 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 @@ -6939,63 +6969,110 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v22 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v28 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v29 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v32bf16_to_v32f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[20:23], 
v[0:1], off offset:32 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v28 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v32bf16_to_v32f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX900-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 +; 
GFX900-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32 +; GFX900-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX900-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v32bf16_to_v32f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 +; GFX950-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32 +; GFX950-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 +; GFX950-NEXT: s_waitcnt vmcnt(3) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GFX950-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX950-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX950-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: global_extload_v32bf16_to_v32f32: ; GFX10: ; %bb.0: @@ -7801,33 +7878,32 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[27:30], v[0:1], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v29 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 -; GCN-NEXT: 
v_cvt_f64_f32_e32 v[8:9], v12 -; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 -; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 ; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 ; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 @@ -7835,6 +7911,7 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 ; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 ; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -7846,33 +7923,33 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[27:30], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX7-NEXT: v_and_b32_e32 v26, 
0xffff0000, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v29 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 @@ -7889,33 +7966,33 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[27:30], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v29 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; 
GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 @@ -7926,46 +8003,87 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v16bf16_to_v16f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 -; 
GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v16bf16_to_v16f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX900-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:16 +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v29 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
global_extload_v16bf16_to_v16f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX950-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; GFX950-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v16bf16_to_v16f64: ; GFX10: ; %bb.0: @@ -8063,12 +8181,12 @@ define <32 x double> 
@global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2 -; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4 -; GCN-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6 -; GCN-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 +; GCN-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:2 +; GCN-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:6 +; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:10 ; GCN-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12 ; GCN-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14 ; GCN-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16 @@ -8079,242 +8197,242 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26 ; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28 ; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30 -; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50 -; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52 -; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54 -; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56 -; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58 -; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 
offset:60 -; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62 +; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50 +; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52 +; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54 +; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56 +; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58 +; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60 +; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62 ; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34 ; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36 ; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38 -; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 -; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 +; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40 +; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42 ; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 ; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 ; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xf4, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; 
GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xe8, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0 +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xe0, v0 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0 +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd8, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd4, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xd0, v0 +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 ; GCN-NEXT: v_cvt_f64_f32_e32 
v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0 +; GCN-NEXT: buffer_store_dword v2, v35, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xc8, v0 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xbc, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb8, v0 +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xb4, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb0, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xac, v0 +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0 +; GCN-NEXT: buffer_store_dword v2, v35, s[0:3], 
0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa8, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xa4, v0 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0xa0, v0 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0 ; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0 -; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0 +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x9c, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x98, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x94, v0 +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0 -; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x90, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x8c, v0 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x88, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 ; GCN-NEXT: 
v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0 +; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x84, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x80, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x7c, v0 ; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x70, v0 +; GCN-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x6c, v0 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x68, v0 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x64, v0 +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 +; GCN-NEXT: buffer_store_dword v2, v34, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x5c, v0 +; GCN-NEXT: 
v_add_i32_e32 v21, vcc, 0x58, v0 +; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x54, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x50, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 +; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v1 +; GCN-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; GCN-NEXT: buffer_store_dword v22, v28, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v18 +; GCN-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v18, vcc, 60, v0 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 +; GCN-NEXT: v_add_i32_e32 
v28, vcc, 52, v0 +; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v17 +; GCN-NEXT: buffer_store_dword v23, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0 -; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 44, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 40, v0 +; GCN-NEXT: buffer_store_dword v22, v36, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GCN-NEXT: buffer_store_dword v17, v27, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 36, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 +; GCN-NEXT: buffer_store_dword v16, v35, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GCN-NEXT: buffer_store_dword v16, 
v37, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 16, v0 +; GCN-NEXT: buffer_store_dword v15, v29, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GCN-NEXT: buffer_store_dword v15, v34, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v14, v21, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; GCN-NEXT: buffer_store_dword v14, v20, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v12 +; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GCN-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; 
GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9 -; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12 -; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 -; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13 -; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v9 +; GCN-NEXT: buffer_store_dword v11, v30, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14 -; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15 -; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen 
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v8 +; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v9 +; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v11 +; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; GCN-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -8325,229 +8443,228 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62 -; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60 -; GFX7-NEXT: 
buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58 -; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56 -; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54 -; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52 -; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50 -; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34 -; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36 -; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38 -; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40 -; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42 -; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44 -; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46 -; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2 -; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6 -; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8 -; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 -; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12 -; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14 -; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18 -; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20 -; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22 -; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24 -; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 
offset:26 -; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30 +; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62 +; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:60 +; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:58 +; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:56 +; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:54 +; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52 +; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50 +; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34 +; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:36 +; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:38 +; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:40 +; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:42 +; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:44 +; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:46 +; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:2 +; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:6 +; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10 +; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12 +; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:14 +; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_ushort v3, 
v[1:2], s[4:7], 0 addr64 offset:18 +; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:20 +; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22 +; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:24 +; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:26 +; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:28 +; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:30 +; GFX7-NEXT: v_add_i32_e32 v35, vcc, 0xf8, v0 +; GFX7-NEXT: v_add_i32_e32 v36, vcc, 0xd4, v0 ; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xfc, v0 +; GFX7-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xf4, v0 +; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v22 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 -; GFX7-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GFX7-NEXT: v_add_i32_e32 v21, vcc, 0xf0, v0 +; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xec, v0 +; GFX7-NEXT: v_add_i32_e32 v35, vcc, 0xe8, v0 +; GFX7-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; GFX7-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0 -; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xe4, v0 +; GFX7-NEXT: v_add_i32_e32 v23, vcc, 0xe0, v0 +; GFX7-NEXT: v_add_i32_e32 v35, vcc, 0xdc, v0 +; GFX7-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; GFX7-NEXT: buffer_store_dword v2, v35, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v2 +; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd8, v0 +; GFX7-NEXT: 
v_add_i32_e32 v25, vcc, 0xd0, v0 +; GFX7-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v20, v25, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GFX7-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v27 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xc8, v0 +; GFX7-NEXT: buffer_store_dword v22, v20, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xc4, v0 +; GFX7-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v32 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_add_i32_e32 v21, vcc, 0xb8, v0 +; GFX7-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xb4, v0 +; GFX7-NEXT: buffer_store_dword v2, v20, 
s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v21, vcc, 0xa8, v0 +; GFX7-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xa4, v0 +; GFX7-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xa0, v0 +; GFX7-NEXT: buffer_store_dword v1, v22, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 
0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0 -; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16 +; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x98, v0 +; GFX7-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v18 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x90, v0 +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0x8c, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v23, v20, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v15 +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x88, v0 +; GFX7-NEXT: v_add_i32_e32 v23, vcc, 0x84, v0 ; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 -; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 
offen -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: buffer_store_dword v22, v15, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x80, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[23:24], v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; GFX7-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0 +; GFX7-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen ; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0 -; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0 -; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 +; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0 +; GFX7-NEXT: buffer_store_dword v14, v22, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x70, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x6c, v0 +; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v9 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; GFX7-NEXT: buffer_store_dword v12, v17, s[0:3], 0 offen ; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7 +; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x64, v0 +; GFX7-NEXT: buffer_store_dword v10, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0 ; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 -; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v6 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX7-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v4 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX7-NEXT: 
v_cvt_f64_f32_e32 v[19:20], v10 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 +; GFX7-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 +; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; GFX7-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 60, v0 +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v10, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen ; GFX7-NEXT: 
v_add_i32_e32 v3, vcc, 36, v0 -; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX7-NEXT: buffer_store_dword v21, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX7-NEXT: buffer_store_dword v24, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; GFX7-NEXT: buffer_store_dword v23, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -8559,501 +8676,499 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, 4, v1 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v1 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v1 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v1 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 10, v1 +; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v5, v[3:4] +; GFX8-NEXT: flat_load_ushort v6, v[6:7] +; GFX8-NEXT: flat_load_ushort v7, v[8:9] +; GFX8-NEXT: flat_load_ushort v8, v[10:11] +; GFX8-NEXT: flat_load_ushort v9, v[12:13] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v1 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 14, v1 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 16, v1 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1 -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 
v17, vcc, 18, v1 ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 22, v1 ; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 24, v1 ; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1 +; GFX8-NEXT: v_add_u32_e32 v25, vcc, 26, v1 ; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, 28, v1 ; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1 +; GFX8-NEXT: v_add_u32_e32 v29, vcc, 30, v1 ; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 32, v1 ; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1 -; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 -; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword 
v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 -; GFX8-NEXT: flat_load_ushort v44, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 -; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v45, v[50:51] -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v46, v[50:51] -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1 -; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v47, v[52:53] -; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1 -; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1 -; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v56, v[54:55] -; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1 -; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v57, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v58, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1 -; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1 -; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v42, v[42:43] -; GFX8-NEXT: flat_load_ushort v34, v[33:34] -; GFX8-NEXT: flat_load_ushort v36, v[35:36] -; GFX8-NEXT: flat_load_ushort v38, v[37:38] -; GFX8-NEXT: flat_load_ushort v39, v[48:49] -; GFX8-NEXT: flat_load_ushort v48, v[50:51] -; GFX8-NEXT: flat_load_ushort v51, v[52:53] -; 
GFX8-NEXT: flat_load_ushort v52, v[54:55] -; GFX8-NEXT: flat_load_ushort v53, v[40:41] -; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1 -; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v37, v[3:4] -; GFX8-NEXT: flat_load_ushort v35, v[5:6] -; GFX8-NEXT: flat_load_ushort v33, v[7:8] -; GFX8-NEXT: flat_load_ushort v8, v[9:10] -; GFX8-NEXT: flat_load_ushort v6, v[11:12] -; GFX8-NEXT: flat_load_ushort v4, v[13:14] -; GFX8-NEXT: flat_load_ushort v2, v[15:16] -; GFX8-NEXT: flat_load_ushort v1, v[19:20] -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 -; GFX8-NEXT: flat_load_ushort v3, v[17:18] -; GFX8-NEXT: flat_load_ushort v5, v[21:22] -; GFX8-NEXT: flat_load_ushort v7, v[23:24] -; GFX8-NEXT: flat_load_ushort v9, v[25:26] -; GFX8-NEXT: flat_load_ushort v10, v[27:28] -; GFX8-NEXT: flat_load_ushort v11, v[29:30] -; GFX8-NEXT: flat_load_ushort v12, v[31:32] -; GFX8-NEXT: flat_load_ushort v13, v[49:50] -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 
0xec, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0 -; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: flat_load_ushort v14, v[3:4] +; GFX8-NEXT: flat_load_ushort v13, v[10:11] +; GFX8-NEXT: flat_load_ushort v15, v[15:16] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 34, v1 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 36, v1 +; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v17, v[17:18] +; GFX8-NEXT: flat_load_ushort v19, v[19:20] +; GFX8-NEXT: flat_load_ushort v21, v[21:22] +; GFX8-NEXT: flat_load_ushort v23, v[23:24] +; GFX8-NEXT: flat_load_ushort v25, v[25:26] +; GFX8-NEXT: flat_load_ushort v27, v[27:28] +; GFX8-NEXT: flat_load_ushort v29, v[29:30] +; GFX8-NEXT: flat_load_ushort v31, v[31:32] +; GFX8-NEXT: flat_load_ushort v33, 
v[11:12] +; GFX8-NEXT: flat_load_ushort v35, v[34:35] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 38, v1 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v36, vcc, 40, v1 +; GFX8-NEXT: v_addc_u32_e32 v37, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v48, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v38, vcc, 42, v1 +; GFX8-NEXT: v_addc_u32_e32 v39, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v49, v[11:12] +; GFX8-NEXT: flat_load_ushort v50, v[36:37] +; GFX8-NEXT: flat_load_ushort v52, v[38:39] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 62, v1 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v51, v[11:12] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 44, v1 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v36, vcc, 60, v1 +; GFX8-NEXT: v_addc_u32_e32 v37, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v53, v[36:37] +; GFX8-NEXT: v_add_u32_e32 v36, vcc, 46, v1 +; GFX8-NEXT: v_addc_u32_e32 v37, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v38, vcc, 58, v1 +; GFX8-NEXT: v_addc_u32_e32 v39, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v54, v[38:39] +; GFX8-NEXT: v_add_u32_e32 v38, vcc, 48, v1 +; GFX8-NEXT: v_addc_u32_e32 v39, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v55, v[11:12] +; GFX8-NEXT: flat_load_ushort v40, v[36:37] +; GFX8-NEXT: flat_load_ushort v41, v[38:39] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 56, v1 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v42, v[11:12] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 50, v1 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v36, vcc, 54, v1 +; GFX8-NEXT: v_addc_u32_e32 v37, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v43, v[11:12] +; GFX8-NEXT: flat_load_ushort v44, v[36:37] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v45, v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; 
GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[9:10], v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 -; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v29, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[35:36], v35 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[33:34], v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[31:32], v31 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[29:30], v29 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v27 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 +; GFX8-NEXT: s_waitcnt vmcnt(13) +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[37:38], v37 +; GFX8-NEXT: v_add_u32_e32 v48, vcc, 4, v0 +; GFX8-NEXT: s_waitcnt vmcnt(12) +; GFX8-NEXT: v_lshlrev_b32_e32 v39, 16, v49 +; GFX8-NEXT: buffer_store_dword v38, v48, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(13) +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; GFX8-NEXT: v_add_u32_e32 v50, vcc, 0xfc, v0 +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: v_lshlrev_b32_e32 v48, 16, v51 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[48:49], v48 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[38:39], v39 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 +; GFX8-NEXT: buffer_store_dword v49, v50, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v49, vcc, 0xf8, v0 +; GFX8-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[48:49], v37 +; GFX8-NEXT: s_waitcnt vmcnt(12) +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v53 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[50:51], v37 +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v52 +; GFX8-NEXT: v_add_u32_e32 v52, vcc, 0xf4, v0 +; GFX8-NEXT: buffer_store_dword v51, v52, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v51, vcc, 0xf0, v0 +; GFX8-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[50:51], v37 +; GFX8-NEXT: s_waitcnt vmcnt(13) +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v54 +; GFX8-NEXT: v_cvt_f64_f32_e32 
v[52:53], v37 +; GFX8-NEXT: v_add_u32_e32 v54, vcc, 0xec, v0 +; GFX8-NEXT: s_waitcnt vmcnt(12) +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v55 +; GFX8-NEXT: buffer_store_dword v53, v54, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v53, vcc, 0xe8, v0 +; GFX8-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[52:53], v37 +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v42 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[54:55], v37 +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; GFX8-NEXT: v_add_u32_e32 v40, vcc, 0xe4, v0 +; GFX8-NEXT: buffer_store_dword v55, v40, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v55, vcc, 0xe0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(10) +; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v44 +; GFX8-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[54:55], v37 +; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v41 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[40:41], v40 +; GFX8-NEXT: v_add_u32_e32 v44, vcc, 0xdc, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v42, 16, v43 +; GFX8-NEXT: s_waitcnt vmcnt(10) +; GFX8-NEXT: v_lshlrev_b32_e32 v43, 16, v45 +; GFX8-NEXT: buffer_store_dword v41, v44, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v41, vcc, 0xd8, v0 +; GFX8-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[40:41], v43 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[42:43], v42 +; GFX8-NEXT: v_add_u32_e32 v44, vcc, 0xd4, v0 +; GFX8-NEXT: buffer_store_dword v41, v44, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v41, vcc, 0xd0, v0 +; GFX8-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[40:41], v37 +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xcc, v0 +; GFX8-NEXT: buffer_store_dword v43, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xc8, v0 +; GFX8-NEXT: buffer_store_dword v42, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xc4, v0 +; GFX8-NEXT: buffer_store_dword v41, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xc0, v0 
+; GFX8-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xbc, v0 +; GFX8-NEXT: buffer_store_dword v55, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xb8, v0 +; GFX8-NEXT: buffer_store_dword v54, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xb4, v0 +; GFX8-NEXT: buffer_store_dword v53, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xb0, v0 +; GFX8-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xac, v0 +; GFX8-NEXT: buffer_store_dword v51, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xa8, v0 +; GFX8-NEXT: buffer_store_dword v50, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xa4, v0 +; GFX8-NEXT: buffer_store_dword v49, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0xa0, v0 +; GFX8-NEXT: buffer_store_dword v48, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0x9c, v0 +; GFX8-NEXT: buffer_store_dword v39, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0x98, v0 +; GFX8-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 0x94, v0 +; GFX8-NEXT: buffer_store_dword v36, v37, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v36, vcc, 0x90, v0 +; GFX8-NEXT: buffer_store_dword v35, v36, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v35, vcc, 0x8c, v0 +; GFX8-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x88, v0 +; GFX8-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x84, v0 +; GFX8-NEXT: buffer_store_dword v32, v33, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v32, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0 +; GFX8-NEXT: buffer_store_dword v30, v31, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v30, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v29, 
vcc, 0x74, v0 +; GFX8-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v28, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x6c, v0 +; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x68, v0 +; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0x64, v0 +; GFX8-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0x5c, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX8-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0x50, v0 +; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x4c, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; GFX8-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x48, v0 +; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x44, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 ; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v0 ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0 -; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36 -; GFX8-NEXT: 
v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 60, v0 ; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 56, v0 +; GFX8-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 52, v0 ; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, 
v0 -; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0 -; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0 -; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 -; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 -; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v0 +; GFX8-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 44, v0 +; GFX8-NEXT: buffer_store_dword v2, v11, s[0:3], 0 offen +; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, 40, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 -; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0 -; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0 -; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v5, 
v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: global_extload_v32bf16_to_v32f64: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: 
global_load_ushort v9, v[1:2], off offset:62 -; GFX900-NEXT: global_load_ushort v11, v[1:2], off offset:60 -; GFX900-NEXT: global_load_ushort v12, v[1:2], off offset:58 -; GFX900-NEXT: global_load_ushort v13, v[1:2], off offset:56 -; GFX900-NEXT: global_load_ushort v14, v[1:2], off offset:54 -; GFX900-NEXT: global_load_ushort v15, v[1:2], off offset:52 -; GFX900-NEXT: global_load_ushort v16, v[1:2], off offset:50 -; GFX900-NEXT: global_load_ushort v17, v[1:2], off offset:48 -; GFX900-NEXT: global_load_ushort v18, v[1:2], off offset:46 -; GFX900-NEXT: global_load_ushort v19, v[1:2], off offset:44 -; GFX900-NEXT: global_load_ushort v20, v[1:2], off offset:42 -; GFX900-NEXT: global_load_ushort v21, v[1:2], off offset:40 -; GFX900-NEXT: global_load_ushort v22, v[1:2], off offset:38 -; GFX900-NEXT: global_load_ushort v23, v[1:2], off offset:36 -; GFX900-NEXT: global_load_ushort v24, v[1:2], off offset:34 -; GFX900-NEXT: global_load_ushort v25, v[1:2], off offset:32 -; GFX900-NEXT: global_load_ushort v26, v[1:2], off -; GFX900-NEXT: global_load_ushort v27, v[1:2], off offset:2 +; GFX900-NEXT: global_load_ushort v16, v[1:2], off offset:62 +; GFX900-NEXT: global_load_ushort v17, v[1:2], off offset:60 +; GFX900-NEXT: global_load_ushort v19, v[1:2], off offset:58 +; GFX900-NEXT: global_load_ushort v20, v[1:2], off offset:56 +; GFX900-NEXT: global_load_ushort v22, v[1:2], off offset:54 +; GFX900-NEXT: global_load_ushort v23, v[1:2], off offset:52 +; GFX900-NEXT: global_load_ushort v24, v[1:2], off offset:50 +; GFX900-NEXT: global_load_ushort v25, v[1:2], off offset:48 +; GFX900-NEXT: global_load_ushort v26, v[1:2], off offset:46 +; GFX900-NEXT: global_load_ushort v27, v[1:2], off offset:44 +; GFX900-NEXT: global_load_ushort v28, v[1:2], off offset:42 +; GFX900-NEXT: global_load_ushort v29, v[1:2], off offset:40 +; GFX900-NEXT: global_load_ushort v30, v[1:2], off offset:38 +; GFX900-NEXT: global_load_ushort v31, v[1:2], off offset:36 +; GFX900-NEXT: global_load_ushort v32, 
v[1:2], off +; GFX900-NEXT: global_load_ushort v33, v[1:2], off offset:34 +; GFX900-NEXT: global_load_ushort v34, v[1:2], off offset:2 +; GFX900-NEXT: global_load_ushort v15, v[1:2], off offset:32 +; GFX900-NEXT: global_load_ushort v13, v[1:2], off offset:4 +; GFX900-NEXT: global_load_ushort v11, v[1:2], off offset:6 ; GFX900-NEXT: global_load_ushort v3, v[1:2], off offset:16 -; GFX900-NEXT: global_load_ushort v4, v[1:2], off offset:18 -; GFX900-NEXT: global_load_ushort v5, v[1:2], off offset:20 +; GFX900-NEXT: global_load_ushort v5, v[1:2], off offset:18 +; GFX900-NEXT: global_load_ushort v4, v[1:2], off offset:20 +; GFX900-NEXT: global_load_ushort v35, v[1:2], off offset:30 ; GFX900-NEXT: global_load_ushort v6, v[1:2], off offset:22 -; GFX900-NEXT: global_load_ushort v8, v[1:2], off offset:24 -; GFX900-NEXT: global_load_ushort v28, v[1:2], off offset:30 -; GFX900-NEXT: global_load_ushort v29, v[1:2], off offset:26 -; GFX900-NEXT: global_load_ushort v30, v[1:2], off offset:28 -; GFX900-NEXT: global_load_ushort v31, v[1:2], off offset:4 -; GFX900-NEXT: global_load_ushort v32, v[1:2], off offset:6 -; GFX900-NEXT: global_load_ushort v33, v[1:2], off offset:8 -; GFX900-NEXT: global_load_ushort v34, v[1:2], off offset:10 -; GFX900-NEXT: global_load_ushort v7, v[1:2], off offset:12 +; GFX900-NEXT: global_load_ushort v7, v[1:2], off offset:24 +; GFX900-NEXT: global_load_ushort v10, v[1:2], off offset:26 +; GFX900-NEXT: global_load_ushort v14, v[1:2], off offset:28 +; GFX900-NEXT: global_load_ushort v12, v[1:2], off offset:8 +; GFX900-NEXT: global_load_ushort v9, v[1:2], off offset:10 +; GFX900-NEXT: global_load_ushort v8, v[1:2], off offset:12 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_load_ushort v1, v[1:2], off offset:14 ; GFX900-NEXT: s_waitcnt vmcnt(31) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v16 ; GFX900-NEXT: s_waitcnt vmcnt(30) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, 
v11 -; GFX900-NEXT: s_waitcnt vmcnt(28) -; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 ; GFX900-NEXT: s_waitcnt vmcnt(29) -; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:252 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v18 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:248 ; GFX900-NEXT: s_waitcnt vmcnt(30) -; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 -; GFX900-NEXT: s_waitcnt vmcnt(31) -; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[20:21], v16 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:244 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 ; GFX900-NEXT: s_waitcnt vmcnt(30) -; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX900-NEXT: s_waitcnt vmcnt(31) -; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 -; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen 
offset:220 -; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GFX900-NEXT: s_waitcnt vmcnt(32) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:240 ; GFX900-NEXT: s_waitcnt vmcnt(30) -; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; GFX900-NEXT: s_waitcnt vmcnt(28) -; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 -; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 -; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 -; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 -; GFX900-NEXT: s_waitcnt vmcnt(33) +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 +; GFX900-NEXT: s_waitcnt vmcnt(29) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 -; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 -; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 -; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172 -; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 -; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen 
offset:164 -; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 -; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 -; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 -; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144 -; GFX900-NEXT: s_waitcnt vmcnt(44) -; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX900-NEXT: s_waitcnt vmcnt(38) -; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX900-NEXT: s_waitcnt vmcnt(38) -; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:236 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:232 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX900-NEXT: s_waitcnt vmcnt(29) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, 
v27 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:228 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:224 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX900-NEXT: s_waitcnt vmcnt(29) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:220 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:216 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 +; GFX900-NEXT: s_waitcnt vmcnt(29) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; GFX900-NEXT: s_waitcnt vmcnt(28) +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:212 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:208 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:204 +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:196 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[22:23], v2 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:200 +; GFX900-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:192 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:188 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:184 +; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:180 +; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:176 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168 +; GFX900-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:164 +; GFX900-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:160 +; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 
offen offset:156 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[16:17], v30 +; GFX900-NEXT: s_waitcnt vmcnt(42) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 ; GFX900-NEXT: s_waitcnt vmcnt(41) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX900-NEXT: s_waitcnt vmcnt(39) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:144 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 -; GFX900-NEXT: s_waitcnt vmcnt(40) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:148 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; GFX900-NEXT: s_waitcnt vmcnt(35) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:152 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[22:23], v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:132 +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 +; GFX900-NEXT: s_waitcnt vmcnt(33) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:140 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[24:25], v13 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:136 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[20:21], v11 +; GFX900-NEXT: s_waitcnt vmcnt(34) +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:128 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116 +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v9 +; 
GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:120 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[7:8], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 -; GFX900-NEXT: s_waitcnt vmcnt(41) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 -; GFX900-NEXT: s_waitcnt vmcnt(40) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 -; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:100 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 -; GFX900-NEXT: s_waitcnt vmcnt(41) -; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX900-NEXT: s_waitcnt vmcnt(40) -; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 -; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GFX900-NEXT: 
v_cvt_f64_f32_e32 v[6:7], v3 -; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 -; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:92 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:88 +; GFX900-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:84 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:76 ; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[3:4], v1 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v12 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:80 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:72 ; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 -; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 -; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 -; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 -; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 -; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 -; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 -; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 -; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 -; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 -; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 
offen offset:20 -; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 -; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 -; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 -; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:60 +; GFX900-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:56 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:52 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:48 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:36 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:32 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24 +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:20 +; GFX900-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:16 +; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:12 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:8 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9240,184 +9355,180 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX10-NEXT: global_load_ushort v9, v[1:2], off offset:12 ; GFX10-NEXT: global_load_ushort v10, v[1:2], off offset:14 ; GFX10-NEXT: global_load_ushort v11, v[1:2], off offset:16 -; GFX10-NEXT: global_load_ushort v12, v[1:2], off offset:18 -; GFX10-NEXT: global_load_ushort v13, v[1:2], off offset:20 -; GFX10-NEXT: global_load_ushort v14, v[1:2], off offset:22 
-; GFX10-NEXT: global_load_ushort v15, v[1:2], off offset:24 -; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26 -; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28 -; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30 -; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62 -; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32 -; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34 -; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36 -; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60 -; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38 -; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40 -; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58 -; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42 -; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44 -; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56 -; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:46 -; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:48 -; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54 +; GFX10-NEXT: global_load_ushort v12, v[1:2], off offset:58 +; GFX10-NEXT: global_load_ushort v13, v[1:2], off offset:60 +; GFX10-NEXT: global_load_ushort v14, v[1:2], off offset:62 +; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:18 +; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:56 +; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:24 +; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:54 +; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:26 +; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:52 +; GFX10-NEXT: global_load_ushort v36, v[1:2], off offset:48 ; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50 -; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52 +; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:32 +; GFX10-NEXT: global_load_ushort v54, v[1:2], off offset:20 +; GFX10-NEXT: global_load_ushort v55, v[1:2], 
off offset:22 +; GFX10-NEXT: global_load_ushort v52, v[1:2], off offset:28 +; GFX10-NEXT: global_load_ushort v53, v[1:2], off offset:30 +; GFX10-NEXT: global_load_ushort v37, v[1:2], off offset:34 +; GFX10-NEXT: global_load_ushort v38, v[1:2], off offset:40 +; GFX10-NEXT: global_load_ushort v48, v[1:2], off offset:42 +; GFX10-NEXT: global_load_ushort v49, v[1:2], off offset:44 +; GFX10-NEXT: global_load_ushort v39, v[1:2], off offset:46 +; GFX10-NEXT: global_load_ushort v50, v[1:2], off offset:36 +; GFX10-NEXT: global_load_ushort v51, v[1:2], off offset:38 ; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v4 ; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX10-NEXT: s_waitcnt vmcnt(27) -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v7 ; GFX10-NEXT: s_waitcnt vmcnt(26) -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v8 ; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v10 ; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v11 ; GFX10-NEXT: s_waitcnt vmcnt(22) -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v12 ; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v13 ; GFX10-NEXT: s_waitcnt vmcnt(20) -; GFX10-NEXT: 
v_lshlrev_b32_e32 v54, 16, v14 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36 -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v14 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v15 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v16 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v18 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v25 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v20 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v21 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v22 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v24 +; GFX10-NEXT: s_waitcnt vmcnt(19) +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GFX10-NEXT: s_waitcnt vmcnt(18) +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18 -; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v30 ; GFX10-NEXT: s_waitcnt vmcnt(14) -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v32 ; GFX10-NEXT: s_waitcnt vmcnt(13) -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21 -; GFX10-NEXT: s_waitcnt vmcnt(12) -; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22 -; GFX10-NEXT: s_waitcnt vmcnt(11) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25 -; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:252 +; GFX10-NEXT: 
v_cvt_f64_f32_e32 v[26:27], v27 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v29 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX10-NEXT: s_waitcnt vmcnt(11) +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v33 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v23 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[32:33], v21 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[34:35], v34 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[36:37], v36 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v25 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v21 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v38 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:216 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v39 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:212 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v49 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[38:39], v38 +; GFX10-NEXT: buffer_store_dword 
v30, v0, s[0:3], 0 offen offset:208 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v21 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v48 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v50 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80 -; 
GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212 -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52 -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164 -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:128 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 -; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108 -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64 -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 -; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 -; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 -; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 -; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; 
GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4 -; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:204 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[34:35], v21 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v50 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v53 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[36:37], v21 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v52 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:184 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[38:39], v21 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v54 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:180 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:176 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:172 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:164 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:168 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:160 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:156 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword 
v26, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:120 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:116 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:112 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:108 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:104 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:100 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:92 +; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:88 +; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:84 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:80 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 +; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:68 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:64 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:60 +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:56 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:52 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:44 +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:36 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:32 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 +; GFX10-NEXT: 
buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v32bf16_to_v32f64: @@ -10939,11 +11050,6 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_fadd_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 @@ -10956,6 +11062,10 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_add_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; GCN-NEXT: v_add_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -10978,11 +11088,9 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: 
v_add_f32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -11005,6 +11113,8 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_add_f32_e32 v10, v10, v26 ; GCN-NEXT: v_add_f32_e32 v9, v9, v25 @@ -11017,6 +11127,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_add_f32_e32 v2, v2, v18 ; GCN-NEXT: v_add_f32_e32 v1, v1, v17 ; GCN-NEXT: v_add_f32_e32 v0, v0, v16 +; GCN-NEXT: v_add_f32_e32 v14, v14, v27 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -11032,7 +11143,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_add_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -11042,22 +11153,22 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_fadd_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; 
GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -11076,16 +11187,16 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -11108,8 +11219,8 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { 
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_add_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v24 @@ -11128,7 +11239,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_add_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -12500,329 +12611,329 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX8-NEXT: v_add_f32_e32 v31, v32, v31 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_add_f32_e32 v30, v14, v30 +; GFX8-NEXT: s_movk_i32 s6, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v32, v31 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s6, v14 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX8-NEXT: v_bfe_u32 v32, v30, 16, 1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 -; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; 
GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX8-NEXT: v_add_f32_e32 v32, v32, v30 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v30 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; GFX8-NEXT: v_add_f32_e32 v33, v30, v33 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: v_add_f32_e32 v29, v13, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX8-NEXT: v_add_f32_e32 v33, v33, v34 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX8-NEXT: v_add_f32_e32 v30, v15, v30 -; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; GFX8-NEXT: v_add_f32_e32 v31, v31, v32 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; GFX8-NEXT: v_add_f32_e32 v32, v15, v32 +; GFX8-NEXT: v_bfe_u32 v15, v31, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v31 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s6, v15 +; GFX8-NEXT: 
v_or_b32_e32 v34, 0x400000, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_bfe_u32 v31, v32, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v32 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s6, v31 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 -; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; GFX8-NEXT: v_add_f32_e32 v29, v33, v29 +; GFX8-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v33 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v32, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v33, v29 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; 
GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 -; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX8-NEXT: v_add_f32_e32 v28, v33, v28 -; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_add_f32_e32 v28, v12, v28 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v28 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; 
GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX8-NEXT: v_add_f32_e32 v27, v33, v27 -; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_add_f32_e32 v27, v11, v27 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v27 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v27 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX8-NEXT: v_add_f32_e32 v26, v33, v26 -; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_add_f32_e32 v26, v10, v26 +; GFX8-NEXT: 
v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v26 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v26 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 -; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX8-NEXT: v_add_f32_e32 v25, v33, v25 -; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_add_f32_e32 v25, v9, v25 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v25 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v25 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; GFX8-NEXT: 
v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX8-NEXT: v_add_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_add_f32_e32 v24, v8, v24 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v24 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v24 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 -; 
GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 -; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v23, v33, v23 -; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_add_f32_e32 v23, v7, v23 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v23 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v23 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX8-NEXT: 
v_add_u32_e32 v33, vcc, v33, v22 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_add_f32_e32 v22, v6, v22 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v22 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 -; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_add_f32_e32 v21, v5, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v21 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v21 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, 
v21, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 -; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s6, v20 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_add_f32_e32 v4, v33, v4 +; GFX8-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX8-NEXT: 
v_bfe_u32 v19, v3, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_add_f32_e32 v3, v33, v3 +; GFX8-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v18, v33, v18 -; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_add_f32_e32 v2, v33, v2 +; GFX8-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_add_f32_e32 v1, v33, v1 +; GFX8-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v0 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], s6, v1 +; GFX8-NEXT: v_or_b32_e32 v16, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v16, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_alignbit_b32 v5, v16, v5, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX8-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v23 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 -; 
GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX8-NEXT: v_alignbit_b32 v7, v16, v7, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v8, v16, v8, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v19, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX8-NEXT: v_alignbit_b32 v9, v16, v9, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v28 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v27 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v21, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v20, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v19, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v18, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v17, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_fadd_v32bf16: @@ -12833,277 +12944,277 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX900-NEXT: v_add_f32_e32 v31, v32, v31 ; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; 
GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_add_f32_e32 v30, v14, v30 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX900-NEXT: v_add_f32_e32 v30, v32, v30 -; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_add3_u32 v14, v32, v31, s4 ; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v33, vcc ; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX900-NEXT: v_add_f32_e32 v32, v32, v29 -; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; GFX900-NEXT: v_add_f32_e32 v33, v30, v33 +; GFX900-NEXT: v_bfe_u32 v34, v33, 16, 1 +; 
GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_add_f32_e32 v29, v13, v29 +; GFX900-NEXT: v_add3_u32 v13, v34, v33, s4 +; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v33 +; GFX900-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc +; GFX900-NEXT: v_add3_u32 v31, v32, v29, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; GFX900-NEXT: v_add_f32_e32 v32, v29, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v31, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX900-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: v_add_f32_e32 v28, v12, v28 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX900-NEXT: v_add_f32_e32 v33, v33, v34 -; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX900-NEXT: v_add_f32_e32 v29, v15, v29 -; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v34 +; GFX900-NEXT: v_add_f32_e32 v31, v31, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GFX900-NEXT: v_add_f32_e32 v33, v15, v33 +; GFX900-NEXT: v_bfe_u32 v15, v31, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v31, v33, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v31, v31, v33, 
s4 ; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc ; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 ; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX900-NEXT: v_add_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; 
GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX900-NEXT: v_add_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v27, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX900-NEXT: v_add_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v26, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; 
GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX900-NEXT: v_add_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v25, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; 
GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX900-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v24, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX900-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v23, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX900-NEXT: 
v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX900-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v22, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX900-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX900-NEXT: 
v_add3_u32 v33, v33, v22, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX900-NEXT: v_add_f32_e32 v21, v33, v21 -; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_add_f32_e32 v5, v33, v5 +; GFX900-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 ; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX900-NEXT: v_cndmask_b32_e32 
v5, v5, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX900-NEXT: v_add_f32_e32 v20, v33, v20 -; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_add_f32_e32 v4, v33, v4 +; GFX900-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX900-NEXT: v_add_f32_e32 v19, v33, v19 -; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_add_f32_e32 v3, v33, v3 +; GFX900-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX900-NEXT: v_bfe_u32 
v18, v2, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX900-NEXT: v_add_f32_e32 v18, v33, v18 -; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_add_f32_e32 v2, v33, v2 +; GFX900-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX900-NEXT: v_add_f32_e32 v17, v33, v17 -; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; 
GFX900-NEXT: v_add_f32_e32 v1, v33, v1 +; GFX900-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX900-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v33, vcc +; GFX900-NEXT: v_perm_b32 v0, v0, v16, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v17, s4 +; GFX900-NEXT: v_perm_b32 v2, v3, v18, s4 +; GFX900-NEXT: v_perm_b32 v3, v4, v19, s4 +; GFX900-NEXT: v_perm_b32 v4, v5, v20, s4 +; 
GFX900-NEXT: v_perm_b32 v5, v21, v32, s4 +; GFX900-NEXT: v_perm_b32 v6, v22, v6, s4 +; GFX900-NEXT: v_perm_b32 v7, v23, v7, s4 +; GFX900-NEXT: v_perm_b32 v8, v24, v8, s4 +; GFX900-NEXT: v_perm_b32 v9, v25, v9, s4 +; GFX900-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX900-NEXT: v_perm_b32 v11, v27, v11, s4 +; GFX900-NEXT: v_perm_b32 v12, v28, v12, s4 +; GFX900-NEXT: v_perm_b32 v13, v29, v13, s4 +; GFX900-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX900-NEXT: v_perm_b32 v15, v31, v15, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fadd_v32bf16: @@ -13237,272 +13348,272 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v29 ; GFX10-NEXT: v_add_f32_e32 v30, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v29 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; GFX10-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX10-NEXT: v_add_f32_e32 v33, v33, v14 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX10-NEXT: v_add3_u32 v32, v32, v31, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_add3_u32 v31, v35, v30, 0x7fff -; GFX10-NEXT: v_add_f32_e32 v35, v13, v29 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v29 +; GFX10-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v32, v33, v32 +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v31 +; GFX10-NEXT: v_add_f32_e32 v37, v13, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v32, v34, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v12 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; 
GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GFX10-NEXT: v_add3_u32 v30, v34, v33, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_add_f32_e32 v34, v36, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add_f32_e32 v33, v12, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX10-NEXT: v_add_f32_e32 v48, v14, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; GFX10-NEXT: v_add_f32_e32 v28, v12, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_add_f32_e32 v35, v36, v12 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v30 +; GFX10-NEXT: v_bfe_u32 v36, v32, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX10-NEXT: v_add_f32_e32 v54, v13, v12 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v10 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_add_f32_e32 v34, v11, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; 
GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add_f32_e32 v33, v36, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; GFX10-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v11, v11, v12 +; GFX10-NEXT: v_add3_u32 v12, v34, v31, 0x7fff +; GFX10-NEXT: v_add_f32_e32 v34, v27, v13 +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX10-NEXT: v_bfe_u32 v50, v48, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v12, v29, vcc_lo +; GFX10-NEXT: v_add3_u32 v12, v33, v30, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v52, v28, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v28 +; GFX10-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v54 +; GFX10-NEXT: v_cndmask_b32_e32 v27, v12, v35, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v26 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v27, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_add_f32_e32 v35, v10, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v25 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_add_f32_e32 v34, v36, 
v10 -; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_add_f32_e32 v10, v10, v12 +; GFX10-NEXT: v_add3_u32 v12, v36, v32, 0x7fff +; GFX10-NEXT: v_bfe_u32 v29, v11, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v35, v35, v26 +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v30, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v12, v38, vcc_lo +; GFX10-NEXT: v_add3_u32 v12, v39, v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX10-NEXT: v_add3_u32 v29, v29, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v34 +; GFX10-NEXT: v_cndmask_b32_e32 v26, v12, v49, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_add_f32_e32 v33, v36, v33 +; GFX10-NEXT: v_bfe_u32 v36, v10, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v9, v9, v12 +; GFX10-NEXT: v_add3_u32 v12, v50, v48, 0x7fff +; GFX10-NEXT: v_add_f32_e32 v39, v39, v25 +; GFX10-NEXT: v_add3_u32 v25, v52, v28, 0x7fff ; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v25, v30, v31, 
vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v9, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v55, v54, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v51, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_add_f32_e32 v24, v35, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v31, v32, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v9 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v53, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v37, v35, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v24, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc_lo -; GFX10-NEXT: v_add3_u32 v31, v34, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX10-NEXT: v_bfe_u32 v35, v7, 16, 1 -; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v7, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo -; GFX10-NEXT: v_add3_u32 v32, v34, v8, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX10-NEXT: v_add3_u32 v24, v35, v7, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_add3_u32 v23, v30, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 
v24, v24, v31, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v6 +; GFX10-NEXT: v_bfe_u32 v49, v9, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v8, v34, v8 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v33, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v28, v39, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v30, v30, v33 +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v32, vcc_lo +; GFX10-NEXT: v_add3_u32 v33, v36, v10, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v31, v8, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v51, 16, 1 +; GFX10-NEXT: v_bfe_u32 v32, v7, 16, 1 +; GFX10-NEXT: v_bfe_u32 v10, v30, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX10-NEXT: v_add3_u32 v37, v37, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v21 ; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo -; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v8 +; GFX10-NEXT: v_add3_u32 v22, v49, v9, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v8 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v51 +; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v7 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v9, v9 +; GFX10-NEXT: v_add3_u32 v9, v28, v39, 0x7fff +; GFX10-NEXT: v_bfe_u32 v28, v6, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v35, v49, v35 +; 
GFX10-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_add3_u32 v21, v31, v8, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v8, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v6, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX10-NEXT: v_add3_u32 v7, v35, v8, 0x7fff -; GFX10-NEXT: v_add_f32_e32 v35, v38, v37 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v21 -; GFX10-NEXT: v_bfe_u32 v37, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v22, s6 -; GFX10-NEXT: v_bfe_u32 v21, v35, 16, 1 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX10-NEXT: v_add3_u32 v37, v37, v6, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GFX10-NEXT: v_add3_u32 v8, v11, v51, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v51, v51 +; GFX10-NEXT: v_add3_u32 v32, v32, v7, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v20 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v7, v7 +; GFX10-NEXT: v_add3_u32 v7, v10, v30, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v39 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v39, v39 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v6 +; GFX10-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v28, v28, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX10-NEXT: v_add_f32_e32 v10, v10, v51 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_add3_u32 v6, v21, v35, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX10-NEXT: v_bfe_u32 v48, v5, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v35, v35 -; GFX10-NEXT: v_add_f32_e32 v8, v21, v8 -; GFX10-NEXT: v_add3_u32 v21, v48, v5, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v30, v30 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, 
v19 -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v5 -; GFX10-NEXT: v_bfe_u32 v20, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v5, v5 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_add_f32_e32 v48, v49, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; GFX10-NEXT: v_add3_u32 v20, v20, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v8, v8 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v4, v4 -; GFX10-NEXT: v_bfe_u32 v4, v48, 16, 1 -; GFX10-NEXT: v_add_f32_e32 v49, v51, v49 -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v48, v48 +; GFX10-NEXT: v_add3_u32 v20, v49, v35, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v10, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v6, v51, v6 +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v4, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v5, v5 +; GFX10-NEXT: v_add3_u32 v5, v49, v10, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v10 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v10, v10 +; GFX10-NEXT: v_add3_u32 v10, v51, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v4, v4 +; GFX10-NEXT: v_add3_u32 v4, v49, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_add3_u32 v4, v4, v48, 0x7fff -; GFX10-NEXT: v_bfe_u32 v48, v49, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v49, v49 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: 
v_cndmask_b32_e64 v32, v32, v36, s8 +; GFX10-NEXT: v_add_f32_e32 v6, v54, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v38, s9 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v39, s8 -; GFX10-NEXT: v_add3_u32 v19, v48, v49, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v17 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v39, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v31, s11 +; GFX10-NEXT: v_bfe_u32 v19, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v30, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v35, s13 +; GFX10-NEXT: v_add3_u32 v19, v19, v6, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v51, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v52, s6 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v17 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v35, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v50, s10 -; GFX10-NEXT: v_add_f32_e32 v49, v52, v49 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v49, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v34, s7 +; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v6, v18, v6 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_bfe_u32 v18, v49, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v49 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v49, v49 -; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v1 -; GFX10-NEXT: v_add3_u32 v18, v18, v49, 0x7fff -; GFX10-NEXT: 
v_lshlrev_b32_e32 v49, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v48, s13 -; GFX10-NEXT: v_add_f32_e32 v17, v49, v17 +; GFX10-NEXT: v_bfe_u32 v36, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v18, v37, v48, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 ; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v35, vcc_lo -; GFX10-NEXT: v_bfe_u32 v22, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v17 -; GFX10-NEXT: v_bfe_u32 v50, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v30, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v51, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v30, v30, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v0 +; GFX10-NEXT: v_bfe_u32 v48, v3, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v31, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v0 -; GFX10-NEXT: v_add3_u32 v49, v49, v17, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_add3_u32 v50, v50, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v36, s4 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v49, v8, vcc_lo +; GFX10-NEXT: v_add3_u32 v51, v51, v0, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; GFX10-NEXT: 
v_cndmask_b32_e32 v17, v30, v35, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v38, s7 -; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX10-NEXT: v_add3_u32 v22, v22, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v34, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v50, v48, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v50, s4 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v3 +; GFX10-NEXT: v_add3_u32 v48, v48, v3, 0x7fff +; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v51, v49, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v52, s14 -; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v38, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v53, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v54, s16 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v52, v34, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v51, s12 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v25, v12, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v26, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v27, v14, 0x7060302 ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v36, v34, vcc_lo -; GFX10-NEXT: v_perm_b32 v10, v25, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v26, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v27, v12, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v28, v13, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v48, v50, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v3, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v5, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v21, v6, 0x7060302 -; GFX10-NEXT: 
v_perm_b32 v6, v37, v7, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v24, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v29, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v10, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v11, v20, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v16 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_add_f32_e32 v17, v33, v8 -; GFX10-NEXT: v_add_f32_e32 v15, v15, v16 -; GFX10-NEXT: v_perm_b32 v8, v32, v31, 0x7060302 -; GFX10-NEXT: v_bfe_u32 v16, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v18, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_add_f32_e32 v17, v37, v6 +; GFX10-NEXT: v_add_f32_e32 v11, v15, v16 +; GFX10-NEXT: v_perm_b32 v6, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v32, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v21, v9, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX10-NEXT: v_perm_b32 v9, v22, v18, 0x7060302 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v15 -; GFX10-NEXT: v_add3_u32 v16, v16, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v18, v18, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX10-NEXT: v_add3_u32 v15, v15, v11, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v33, v23, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v18, v16, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_perm_b32 v11, v29, v24, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v19, vcc_lo ; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -13510,302 +13621,293 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> 
%a, <32 x bfloat> %b) { ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 -; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11TRUE16-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 ; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 ; GFX11TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX11TRUE16-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 -; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1 -; 
GFX11TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11TRUE16-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24 +; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v2 ; GFX11TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 -; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff -; GFX11TRUE16-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v18, 16, v18 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 -; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX11TRUE16-NEXT: v_dual_add_f32 v18, v84, v83 :: v_dual_add_f32 v9, v9, v25 -; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 -; GFX11TRUE16-NEXT: v_add_f32_e32 v17, v86, v85 -; GFX11TRUE16-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 ; GFX11TRUE16-NEXT: v_bfe_u32 
v145, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v17 -; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 -; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11TRUE16-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27 ; GFX11TRUE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX11TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX11TRUE16-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_and_b32 v37, 0xffff0000, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_and_b32 v49, 0xffff0000, v26 +; GFX11TRUE16-NEXT: v_add_f32_e32 v24, v64, v55 ; GFX11TRUE16-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX11TRUE16-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v28, 16, v28 -; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_add_f32_e32 v23, v66, v65 +; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 ; GFX11TRUE16-NEXT: v_bfe_u32 v85, v24, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 +; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 ; GFX11TRUE16-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v27, 
16, v27 +; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_add_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v19, 16, v19 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 ; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v23 -; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 +; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 -; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11TRUE16-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX11TRUE16-NEXT: v_add_f32_e32 v20, v80, v71 -; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 -; GFX11TRUE16-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_lshlrev_b32 v10, 16, v10 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11TRUE16-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v27, 16, v27 -; GFX11TRUE16-NEXT: v_dual_add_f32 v26, v52, v51 :: v_dual_add_f32 v25, v54, v53 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX11TRUE16-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11TRUE16-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_and_b32 v51, 0xffff0000, v25 +; GFX11TRUE16-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v52, 0xffff0000, v9 +; GFX11TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX11TRUE16-NEXT: v_dual_add_f32 v20, v80, v71 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11TRUE16-NEXT: v_dual_add_f32 v2, v2, v18 :: v_dual_lshlrev_b32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v28, 16, v28 +; GFX11TRUE16-NEXT: v_add_f32_e32 v18, v84, v83 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11TRUE16-NEXT: v_add_f32_e32 v25, v54, v53 +; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11TRUE16-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11TRUE16-NEXT: v_dual_add_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v5, 16, v5 ; GFX11TRUE16-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v36, 0xffff0000, v13 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; GFX11TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11TRUE16-NEXT: v_add_f32_e32 v27, v50, v49 +; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v65, 
v27, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11TRUE16-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX11TRUE16-NEXT: v_add_f32_e32 v21, v70, v69 +; GFX11TRUE16-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v13, 16, v13 ; GFX11TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11TRUE16-NEXT: v_add_f32_e32 v22, v68, v67 -; GFX11TRUE16-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11TRUE16-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v12, 16, v12 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_add_f32_e32 v26, v52, v51 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v14, 16, v14 ; GFX11TRUE16-NEXT: v_dual_add_f32 v29, v38, v37 :: v_dual_lshlrev_b32 v30, 16, v30 ; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GFX11TRUE16-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX11TRUE16-NEXT: v_add_f32_e32 v28, v48, v39 ; GFX11TRUE16-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33 -; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v35, v14, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v16, v33, 16, 1 ; GFX11TRUE16-NEXT: 
v_or_b32_e32 v34, 0x400000, v33 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 -; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11TRUE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 -; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1 -; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11TRUE16-NEXT: v_bfe_u32 v67, v10, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff ; 
GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11TRUE16-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v26 -; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 +; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 +; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 ; GFX11TRUE16-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v22 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1 ; GFX11TRUE16-NEXT: 
v_add3_u32 v101, v101, v22, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 ; GFX11TRUE16-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v21 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11TRUE16-NEXT: v_bfe_u32 v115, v4, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v4 ; GFX11TRUE16-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v20 +; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff -; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11TRUE16-NEXT: v_bfe_u32 
v131, v2, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 +; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v148, 0x400000, v0 +; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27 -; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28 -; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26 -; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29 +; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 +; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 -; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26 ; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v25 +; GFX11TRUE16-NEXT: 
v_bfi_b32 v10, 0xffff, v10, v27 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28 +; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29 +; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v24 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v23 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v22 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v4.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v21 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v20 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v19 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v148, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v18 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v32 -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17 ; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v32 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_add_f32_e32 v15, v15, v33 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_add_f32_e32 v17, v31, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfe_u32 v18, v15, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17 ; GFX11TRUE16-NEXT: v_add3_u32 v18, v18, v15, 0x7fff -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v17, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v21, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v17 ; 
GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13813,219 +13915,218 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v17 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v26 -; GFX11FAKE16-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v86, 16, v0 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11FAKE16-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11FAKE16-NEXT: v_add_f32_e32 v17, v86, v85 ; GFX11FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 -; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; 
GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16 -; GFX11FAKE16-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 -; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v8 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11FAKE16-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 -; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 -; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_bfe_u32 v145, v17, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v146, 0x400000, v17 -; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; 
GFX11FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 ; GFX11FAKE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v23 ; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX11FAKE16-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX11FAKE16-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX11FAKE16-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83 -; GFX11FAKE16-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11FAKE16-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 +; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11FAKE16-NEXT: v_add_f32_e32 v24, v64, v55 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 ; GFX11FAKE16-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: 
v_lshlrev_b32_e32 v68, 16, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: v_add_f32_e32 v6, v6, v22 ; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 ; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v23 -; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX11FAKE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 -; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11FAKE16-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX11FAKE16-NEXT: v_add_f32_e32 v20, v80, v71 +; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX11FAKE16-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX11FAKE16-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX11FAKE16-NEXT: v_dual_add_f32 v20, v80, v71 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11FAKE16-NEXT: v_add_f32_e32 v19, v82, v81 +; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11FAKE16-NEXT: v_add3_u32 v119, 
v119, v3, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX11FAKE16-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11FAKE16-NEXT: v_add_f32_e32 v18, v84, v83 +; GFX11FAKE16-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX11FAKE16-NEXT: v_dual_add_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 +; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff ; GFX11FAKE16-NEXT: v_bfe_u32 v71, v9, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11FAKE16-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11FAKE16-NEXT: 
v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 -; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11FAKE16-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX11FAKE16-NEXT: v_add_f32_e32 v21, v70, v69 +; GFX11FAKE16-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX11FAKE16-NEXT: v_add_f32_e32 v26, v52, v51 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11FAKE16-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11FAKE16-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 -; GFX11FAKE16-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 -; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11FAKE16-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11FAKE16-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_add_f32_e32 v29, v38, v37 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11FAKE16-NEXT: v_dual_add_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; GFX11FAKE16-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX11FAKE16-NEXT: v_add_f32_e32 v28, v48, v39 ; 
GFX11FAKE16-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33 -; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1 ; GFX11FAKE16-NEXT: v_bfe_u32 v35, v14, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v14 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11FAKE16-NEXT: v_bfe_u32 v16, v33, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11FAKE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 -; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11FAKE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 -; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, 
v11 -; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 ; GFX11FAKE16-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11FAKE16-NEXT: v_bfe_u32 v69, v26, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v26 -; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11FAKE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 ; GFX11FAKE16-NEXT: v_bfe_u32 v83, v8, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11FAKE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11FAKE16-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v22 -; 
GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11FAKE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 ; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v21 ; GFX11FAKE16-NEXT: v_bfe_u32 v115, v4, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v4 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11FAKE16-NEXT: v_bfe_u32 v117, v20, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v20 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11FAKE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 ; GFX11FAKE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11FAKE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 -; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, 
vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11FAKE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v148, 0x400000, v0 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11FAKE16-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 -; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff ; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff ; GFX11FAKE16-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 ; GFX11FAKE16-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 ; GFX11FAKE16-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; 
GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -14058,19 +14159,20 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v147, v148 :: v_dual_and_b32 v15, 0xffff0000, v15 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v32 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_add_f32_e32 v17, v31, v17 ; GFX11FAKE16-NEXT: v_add_f32_e32 v15, v15, v18 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11FAKE16-NEXT: 
v_bfe_u32 v18, v17, 16, 1 @@ -16481,11 +16583,6 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_fmul_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 @@ -16498,6 +16595,10 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -16520,11 +16621,9 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -16547,6 +16646,8 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 @@ -16559,6 +16660,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v2, v2, v18 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v17 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v16 +; GCN-NEXT: v_mul_f32_e32 v14, v14, v27 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -16574,7 +16676,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_mul_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -16584,22 +16686,22 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; 
GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -16618,16 +16720,16 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -16650,8 +16752,8 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24 @@ -16670,7 +16772,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x 
bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -18042,329 +18144,329 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_mul_f32_e32 v30, v14, v30 +; GFX8-NEXT: s_movk_i32 s6, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v32, v31 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s6, v14 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX8-NEXT: v_bfe_u32 v32, v30, 16, 1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 -; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX8-NEXT: v_mul_f32_e32 v32, v32, v30 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v33, vcc +; GFX8-NEXT: 
v_add_u32_e32 v32, vcc, v32, v30 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; GFX8-NEXT: v_mul_f32_e32 v33, v30, v33 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: v_mul_f32_e32 v29, v13, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30 -; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; GFX8-NEXT: v_mul_f32_e32 v31, v31, v32 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; GFX8-NEXT: v_mul_f32_e32 v32, v15, v32 +; GFX8-NEXT: v_bfe_u32 v15, v31, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v31 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s6, v15 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_bfe_u32 v31, v32, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 -; 
GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v32 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s6, v31 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 -; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; GFX8-NEXT: v_mul_f32_e32 v29, v33, v29 +; GFX8-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v33 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v32, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v33, v29 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 -; GFX8-NEXT: v_add_u32_e32 v28, 
vcc, s4, v28 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX8-NEXT: v_mul_f32_e32 v28, v33, v28 -; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_mul_f32_e32 v28, v12, v28 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v28 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX8-NEXT: v_mul_f32_e32 v27, v33, v27 -; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_mul_f32_e32 v27, v11, 
v27 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v27 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v27 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX8-NEXT: v_mul_f32_e32 v26, v33, v26 -; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_mul_f32_e32 v26, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v26 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v26 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v32, v33, 
vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 -; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX8-NEXT: v_mul_f32_e32 v25, v33, v25 -; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_mul_f32_e32 v25, v9, v25 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v25 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v25 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX8-NEXT: v_bfe_u32 
v24, v8, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_mul_f32_e32 v24, v8, v24 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v24 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v24 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 -; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23 -; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 -; 
GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_mul_f32_e32 v23, v7, v23 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v23 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v23 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_mul_f32_e32 v22, v6, v22 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v22 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 -; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_mul_f32_e32 v21, v5, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v21 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v21 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_add_u32_e32 v33, 
vcc, s6, v33 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 -; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s6, v20 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_mul_f32_e32 v4, v33, v4 +; GFX8-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_mul_f32_e32 v3, v33, v3 +; GFX8-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18 -; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_mul_f32_e32 v2, v33, v2 +; GFX8-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; GFX8-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_mul_f32_e32 v1, v33, v1 +; GFX8-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; 
GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v0 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], s6, v1 +; GFX8-NEXT: v_or_b32_e32 v16, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v16, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_alignbit_b32 v5, v16, v5, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX8-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v23 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; GFX8-NEXT: v_alignbit_b32 v15, 
v16, v15, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX8-NEXT: v_alignbit_b32 v7, v16, v7, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v8, v16, v8, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v19, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX8-NEXT: v_alignbit_b32 v9, v16, v9, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v28 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v27 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v21, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v20, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v19, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v18, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v17, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_fmul_v32bf16: @@ -18375,277 +18477,277 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX900-NEXT: v_mul_f32_e32 v31, v32, v31 ; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_mul_f32_e32 v30, v14, v30 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; 
GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX900-NEXT: v_mul_f32_e32 v30, v32, v30 -; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_add3_u32 v14, v32, v31, s4 ; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v33, vcc ; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX900-NEXT: v_mul_f32_e32 v32, v32, v29 -; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; GFX900-NEXT: v_mul_f32_e32 v33, v30, v33 +; GFX900-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_mul_f32_e32 v29, v13, v29 +; GFX900-NEXT: v_add3_u32 v13, v34, v33, s4 +; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v33 +; GFX900-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; 
GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc +; GFX900-NEXT: v_add3_u32 v31, v32, v29, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; GFX900-NEXT: v_mul_f32_e32 v32, v29, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v31, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX900-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: v_mul_f32_e32 v28, v12, v28 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX900-NEXT: v_mul_f32_e32 v33, v33, v34 -; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX900-NEXT: v_mul_f32_e32 v29, v15, v29 -; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v34 +; GFX900-NEXT: v_mul_f32_e32 v31, v31, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GFX900-NEXT: v_mul_f32_e32 v33, v15, v33 +; GFX900-NEXT: v_bfe_u32 v15, v31, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v31, v33, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v31, v31, v33, s4 ; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; 
GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc ; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 ; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX900-NEXT: v_mul_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX900-NEXT: v_mul_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v27, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 
v11, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX900-NEXT: v_mul_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v26, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX900-NEXT: 
v_or_b32_e32 v34, 0x400000, v26 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX900-NEXT: v_mul_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v25, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX900-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v24, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, 
v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX900-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v23, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX900-NEXT: v_and_b32_e32 v6, 
0xffff0000, v6 -; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX900-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v22, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX900-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 ; 
GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX900-NEXT: v_mul_f32_e32 v21, v33, v21 -; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_mul_f32_e32 v5, v33, v5 +; GFX900-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 ; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX900-NEXT: v_mul_f32_e32 v20, v33, v20 -; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_mul_f32_e32 v4, v33, v4 +; GFX900-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GFX900-NEXT: v_and_b32_e32 
v3, 0xffff0000, v3 -; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX900-NEXT: v_mul_f32_e32 v19, v33, v19 -; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_mul_f32_e32 v3, v33, v3 +; GFX900-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX900-NEXT: 
v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX900-NEXT: v_mul_f32_e32 v18, v33, v18 -; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_mul_f32_e32 v2, v33, v2 +; GFX900-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX900-NEXT: v_mul_f32_e32 v17, v33, v17 -; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_mul_f32_e32 v1, v33, v1 +; GFX900-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX900-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 -; 
GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v33, vcc +; GFX900-NEXT: v_perm_b32 v0, v0, v16, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v17, s4 +; GFX900-NEXT: v_perm_b32 v2, v3, v18, s4 +; GFX900-NEXT: v_perm_b32 v3, v4, v19, s4 +; GFX900-NEXT: v_perm_b32 v4, v5, v20, s4 +; GFX900-NEXT: v_perm_b32 v5, v21, v32, s4 +; GFX900-NEXT: v_perm_b32 v6, v22, v6, s4 +; GFX900-NEXT: v_perm_b32 v7, v23, v7, s4 +; GFX900-NEXT: v_perm_b32 v8, v24, v8, s4 +; GFX900-NEXT: v_perm_b32 v9, v25, v9, s4 +; GFX900-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX900-NEXT: v_perm_b32 v11, v27, v11, s4 +; GFX900-NEXT: v_perm_b32 v12, v28, v12, s4 +; GFX900-NEXT: v_perm_b32 v13, v29, v13, s4 +; GFX900-NEXT: v_perm_b32 v14, v30, v14, s4 +; 
GFX900-NEXT: v_perm_b32 v15, v31, v15, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmul_v32bf16: @@ -18779,272 +18881,272 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v29 ; GFX10-NEXT: v_mul_f32_e32 v30, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v29 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; GFX10-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX10-NEXT: v_mul_f32_e32 v33, v33, v14 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX10-NEXT: v_add3_u32 v32, v32, v31, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_add3_u32 v31, v35, v30, 0x7fff -; GFX10-NEXT: v_mul_f32_e32 v35, v13, v29 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v29 +; GFX10-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v32, v33, v32 +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v31 +; GFX10-NEXT: v_mul_f32_e32 v37, v13, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v32, v34, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v12 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GFX10-NEXT: v_add3_u32 v30, v34, v33, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_mul_f32_e32 v34, v36, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_mul_f32_e32 v33, v12, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 
0xffff0000, v12 +; GFX10-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX10-NEXT: v_mul_f32_e32 v48, v14, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; GFX10-NEXT: v_mul_f32_e32 v28, v12, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_mul_f32_e32 v35, v36, v12 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v30 +; GFX10-NEXT: v_bfe_u32 v36, v32, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX10-NEXT: v_mul_f32_e32 v54, v13, v12 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v10 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_mul_f32_e32 v34, v11, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_mul_f32_e32 v33, v36, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; GFX10-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v11, v11, v12 +; GFX10-NEXT: v_add3_u32 v12, v34, 
v31, 0x7fff +; GFX10-NEXT: v_mul_f32_e32 v34, v27, v13 +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX10-NEXT: v_bfe_u32 v50, v48, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v12, v29, vcc_lo +; GFX10-NEXT: v_add3_u32 v12, v33, v30, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v52, v28, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v28 +; GFX10-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v54 +; GFX10-NEXT: v_cndmask_b32_e32 v27, v12, v35, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v26 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v27, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_mul_f32_e32 v35, v10, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v25 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_mul_f32_e32 v34, v36, v10 -; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_mul_f32_e32 v10, v10, v12 +; GFX10-NEXT: v_add3_u32 v12, v36, v32, 0x7fff +; GFX10-NEXT: v_bfe_u32 v29, v11, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v35, v35, v26 +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v30, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v12, v38, vcc_lo +; GFX10-NEXT: v_add3_u32 v12, v39, 
v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX10-NEXT: v_add3_u32 v29, v29, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v34 +; GFX10-NEXT: v_cndmask_b32_e32 v26, v12, v49, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_mul_f32_e32 v33, v36, v33 +; GFX10-NEXT: v_bfe_u32 v36, v10, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v9, v9, v12 +; GFX10-NEXT: v_add3_u32 v12, v50, v48, 0x7fff +; GFX10-NEXT: v_mul_f32_e32 v39, v39, v25 +; GFX10-NEXT: v_add3_u32 v25, v52, v28, 0x7fff ; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v25, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v9, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v55, v54, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v51, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v23 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v52, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_mul_f32_e32 v24, v35, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v31, v32, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v9 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v53, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v37, v35, 16, 1 ; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v24, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc_lo -; GFX10-NEXT: v_add3_u32 v31, v34, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX10-NEXT: v_bfe_u32 v35, v7, 16, 1 -; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v7, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo -; GFX10-NEXT: v_add3_u32 v32, v34, v8, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX10-NEXT: v_add3_u32 v24, v35, v7, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_add3_u32 v23, v30, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v31, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v6 +; GFX10-NEXT: v_bfe_u32 v49, v9, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_mul_f32_e32 v8, v34, v8 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: 
v_cndmask_b32_e32 v29, v29, v33, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v28, v39, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v30, v30, v33 +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v32, vcc_lo +; GFX10-NEXT: v_add3_u32 v33, v36, v10, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v31, v8, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v51, 16, 1 +; GFX10-NEXT: v_bfe_u32 v32, v7, 16, 1 +; GFX10-NEXT: v_bfe_u32 v10, v30, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX10-NEXT: v_add3_u32 v37, v37, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v21 ; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo -; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v8 +; GFX10-NEXT: v_add3_u32 v22, v49, v9, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v8 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v51 +; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v7 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v9, v9 +; GFX10-NEXT: v_add3_u32 v9, v28, v39, 0x7fff +; GFX10-NEXT: v_bfe_u32 v28, v6, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v35, v49, v35 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_add3_u32 v21, v31, v8, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v8, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v6, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX10-NEXT: v_add3_u32 v7, v35, v8, 0x7fff -; GFX10-NEXT: v_mul_f32_e32 v35, v38, v37 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v21 -; GFX10-NEXT: v_bfe_u32 v37, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; 
GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v22, s6 -; GFX10-NEXT: v_bfe_u32 v21, v35, 16, 1 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v8 -; GFX10-NEXT: v_add3_u32 v37, v37, v6, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GFX10-NEXT: v_add3_u32 v8, v11, v51, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v51, v51 +; GFX10-NEXT: v_add3_u32 v32, v32, v7, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v20 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v7, v7 +; GFX10-NEXT: v_add3_u32 v7, v10, v30, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v39 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v39, v39 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v6 +; GFX10-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v28, v28, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX10-NEXT: v_mul_f32_e32 v10, v10, v51 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_add3_u32 v6, v21, v35, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX10-NEXT: v_bfe_u32 v48, v5, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v35, v35 -; GFX10-NEXT: v_mul_f32_e32 v8, v21, v8 -; GFX10-NEXT: v_add3_u32 v21, v48, v5, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v30, v30 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v5 -; GFX10-NEXT: v_bfe_u32 v20, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v5, v5 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_mul_f32_e32 v48, v49, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; GFX10-NEXT: v_add3_u32 v20, v20, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v8, v8 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v4, v4 -; GFX10-NEXT: v_bfe_u32 v4, v48, 16, 1 -; GFX10-NEXT: v_mul_f32_e32 v49, v51, v49 -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v48, v48 +; GFX10-NEXT: v_add3_u32 v20, v49, v35, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v10, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v6, v51, v6 +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v4, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v5, v5 +; GFX10-NEXT: v_add3_u32 v5, v49, v10, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v10 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v10, v10 +; GFX10-NEXT: v_add3_u32 v10, v51, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v4, v4 +; GFX10-NEXT: v_add3_u32 v4, v49, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_add3_u32 v4, v4, v48, 0x7fff -; GFX10-NEXT: v_bfe_u32 v48, v49, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v49, v49 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v32, v32, v36, s8 +; GFX10-NEXT: v_mul_f32_e32 v6, v54, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v38, s9 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v39, s8 -; GFX10-NEXT: v_add3_u32 v19, v48, v49, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v17 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v39, s10 +; 
GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v31, s11 +; GFX10-NEXT: v_bfe_u32 v19, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v30, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v35, s13 +; GFX10-NEXT: v_add3_u32 v19, v19, v6, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v51, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v52, s6 +; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v17 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v35, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v50, s10 -; GFX10-NEXT: v_mul_f32_e32 v49, v52, v49 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v49, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v34, s7 +; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v6, v18, v6 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_bfe_u32 v18, v49, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v49 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v49, v49 -; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v1 -; GFX10-NEXT: v_add3_u32 v18, v18, v49, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v48, s13 -; GFX10-NEXT: v_mul_f32_e32 v17, v49, v17 +; GFX10-NEXT: v_bfe_u32 v36, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v18, v37, v48, vcc_lo ; 
GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v35, vcc_lo -; GFX10-NEXT: v_bfe_u32 v22, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v17 -; GFX10-NEXT: v_bfe_u32 v50, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v30, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v51, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v30, v30, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v0 +; GFX10-NEXT: v_bfe_u32 v48, v3, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v31, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v0 -; GFX10-NEXT: v_add3_u32 v49, v49, v17, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_add3_u32 v50, v50, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v36, s4 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v49, v8, vcc_lo +; GFX10-NEXT: v_add3_u32 v51, v51, v0, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v30, v35, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v38, s7 -; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX10-NEXT: v_add3_u32 v22, v22, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v34, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v50, v48, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v50, s4 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v3 +; GFX10-NEXT: v_add3_u32 v48, 
v48, v3, 0x7fff +; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v51, v49, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v52, s14 -; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v38, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v53, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v54, s16 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v52, v34, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v51, s12 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v25, v12, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v26, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v27, v14, 0x7060302 ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v36, v34, vcc_lo -; GFX10-NEXT: v_perm_b32 v10, v25, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v26, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v27, v12, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v28, v13, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v48, v50, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v3, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v5, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v21, v6, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v37, v7, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v24, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v29, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v10, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v11, v20, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v16 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_mul_f32_e32 v17, v33, v8 -; GFX10-NEXT: 
v_mul_f32_e32 v15, v15, v16 -; GFX10-NEXT: v_perm_b32 v8, v32, v31, 0x7060302 -; GFX10-NEXT: v_bfe_u32 v16, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v18, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_mul_f32_e32 v17, v37, v6 +; GFX10-NEXT: v_mul_f32_e32 v11, v15, v16 +; GFX10-NEXT: v_perm_b32 v6, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v32, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v21, v9, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX10-NEXT: v_perm_b32 v9, v22, v18, 0x7060302 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v15 -; GFX10-NEXT: v_add3_u32 v16, v16, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v18, v18, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX10-NEXT: v_add3_u32 v15, v15, v11, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v33, v23, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v18, v16, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_perm_b32 v11, v29, v24, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v19, vcc_lo ; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -19052,302 +19154,293 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 -; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX11TRUE16-NEXT: 
v_lshlrev_b32_e32 v26, 16, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11TRUE16-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 ; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 ; GFX11TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 -; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11TRUE16-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24 +; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v2 ; GFX11TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; 
GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 -; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff -; GFX11TRUE16-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v18, 16, v18 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 -; GFX11TRUE16-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v18, v84, v83 :: v_dual_mul_f32 v9, v9, v25 -; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 -; GFX11TRUE16-NEXT: v_mul_f32_e32 v17, v86, v85 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 ; GFX11TRUE16-NEXT: v_bfe_u32 v145, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v17 -; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 -; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27 ; GFX11TRUE16-NEXT: v_add3_u32 v145, v145, 
v17, 0x7fff ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX11TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_and_b32 v37, 0xffff0000, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_and_b32 v49, 0xffff0000, v26 +; GFX11TRUE16-NEXT: v_mul_f32_e32 v24, v64, v55 ; GFX11TRUE16-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v28, 16, v28 -; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mul_f32_e32 v23, v66, v65 +; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 ; GFX11TRUE16-NEXT: v_bfe_u32 v85, v24, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 +; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 ; GFX11TRUE16-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_mul_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v19, 16, v19 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 ; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v23 -; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; 
GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 +; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 -; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11TRUE16-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX11TRUE16-NEXT: v_mul_f32_e32 v20, v80, v71 -; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_lshlrev_b32 v10, 16, v10 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v27, 16, v27 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v26, v52, v51 :: v_dual_mul_f32 v25, v54, v53 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_and_b32 v51, 0xffff0000, v25 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v52, 0xffff0000, v9 +; GFX11TRUE16-NEXT: 
v_and_b32_e32 v37, 0xffff0000, v28 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v20, v80, v71 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v2, v2, v18 :: v_dual_lshlrev_b32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v28, 16, v28 +; GFX11TRUE16-NEXT: v_mul_f32_e32 v18, v84, v83 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11TRUE16-NEXT: v_mul_f32_e32 v25, v54, v53 +; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v5, 16, v5 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v36, 0xffff0000, v13 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; GFX11TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11TRUE16-NEXT: v_mul_f32_e32 v27, v50, v49 +; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11TRUE16-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX11TRUE16-NEXT: v_mul_f32_e32 v21, v70, v69 +; GFX11TRUE16-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v13, 16, v13 ; GFX11TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11TRUE16-NEXT: v_mul_f32_e32 v22, v68, v67 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11TRUE16-NEXT: 
v_lshlrev_b32_e32 v14, 16, v14 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v12, 16, v12 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mul_f32_e32 v26, v52, v51 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v14, 16, v14 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v29, v38, v37 :: v_dual_lshlrev_b32 v30, 16, v30 ; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GFX11TRUE16-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX11TRUE16-NEXT: v_mul_f32_e32 v28, v48, v39 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33 -; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v35, v14, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v16, v33, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 -; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11TRUE16-NEXT: 
v_or_b32_e32 v38, 0x400000, v30 ; GFX11TRUE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 -; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1 -; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11TRUE16-NEXT: v_bfe_u32 v67, v10, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11TRUE16-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v26 -; GFX11TRUE16-NEXT: v_bfe_u32 v81, 
v25, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 +; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 +; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 ; GFX11TRUE16-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v22 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 ; GFX11TRUE16-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v21 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 
; GFX11TRUE16-NEXT: v_bfe_u32 v115, v4, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v4 ; GFX11TRUE16-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v20 +; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff -; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 +; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, 
v9.h +; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v148, 0x400000, v0 +; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27 -; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28 -; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26 -; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29 +; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 +; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 -; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26 ; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v25 +; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28 +; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29 +; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h ; GFX11TRUE16-NEXT: 
v_cndmask_b32_e32 v23, v97, v98, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v24 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v23 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v22 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v21 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v20 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v19 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v148, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v18 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v32 -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17 ; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 
0xffff0000, v32 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mul_f32_e32 v15, v15, v33 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mul_f32_e32 v17, v31, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfe_u32 v18, v15, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17 ; GFX11TRUE16-NEXT: v_add3_u32 v18, v18, v15, 0x7fff -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v17, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v21, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v17 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19355,219 +19448,218 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; 
GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v17 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v26 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v86, 16, v0 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11FAKE16-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11FAKE16-NEXT: v_mul_f32_e32 v17, v86, v85 ; GFX11FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 -; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16 -; 
GFX11FAKE16-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 -; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v8 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 -; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 -; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_bfe_u32 v145, v17, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v146, 0x400000, v17 -; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 ; GFX11FAKE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; 
GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v23 ; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX11FAKE16-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX11FAKE16-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11FAKE16-NEXT: v_mul_f32_e32 v24, v64, v55 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 ; GFX11FAKE16-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 ; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v23 -; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; 
GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX11FAKE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 -; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11FAKE16-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX11FAKE16-NEXT: v_mul_f32_e32 v20, v80, v71 +; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v20, v80, v71 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11FAKE16-NEXT: v_mul_f32_e32 v19, v82, v81 +; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v9, 
v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11FAKE16-NEXT: v_mul_f32_e32 v18, v84, v83 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 +; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff ; GFX11FAKE16-NEXT: v_bfe_u32 v71, v9, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 -; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11FAKE16-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX11FAKE16-NEXT: v_mul_f32_e32 v21, v70, v69 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX11FAKE16-NEXT: 
v_mul_f32_e32 v26, v52, v51 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 -; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_mul_f32_e32 v29, v38, v37 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11FAKE16-NEXT: v_dual_mul_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; GFX11FAKE16-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX11FAKE16-NEXT: v_mul_f32_e32 v28, v48, v39 ; GFX11FAKE16-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33 -; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1 ; GFX11FAKE16-NEXT: v_bfe_u32 v35, v14, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v14 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; 
GFX11FAKE16-NEXT: v_bfe_u32 v16, v33, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11FAKE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 -; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11FAKE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 -; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 ; GFX11FAKE16-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, 
vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11FAKE16-NEXT: v_bfe_u32 v69, v26, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v26 -; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11FAKE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 ; GFX11FAKE16-NEXT: v_bfe_u32 v83, v8, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11FAKE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11FAKE16-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v22 -; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11FAKE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; 
GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 ; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v21 ; GFX11FAKE16-NEXT: v_bfe_u32 v115, v4, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v4 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11FAKE16-NEXT: v_bfe_u32 v117, v20, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v20 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11FAKE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 ; GFX11FAKE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11FAKE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 -; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11FAKE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v148, 0x400000, v0 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 
; GFX11FAKE16-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 -; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff ; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff ; GFX11FAKE16-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 ; GFX11FAKE16-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 ; GFX11FAKE16-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -19600,19 +19692,20 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, 
v18, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v147, v148 :: v_dual_and_b32 v15, 0xffff0000, v15 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v32 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_mul_f32_e32 v17, v31, v17 ; GFX11FAKE16-NEXT: v_mul_f32_e32 v15, v15, v18 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_bfe_u32 v18, v17, 16, 1 @@ -21519,11 +21612,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_minnum_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_min_f32_e32 v14, v14, v30 ; GCN-NEXT: 
v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 @@ -21536,6 +21624,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_min_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; GCN-NEXT: v_min_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -21558,11 +21650,9 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_min_f32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -21585,6 +21675,8 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_min_f32_e32 v10, v10, v26 ; GCN-NEXT: v_min_f32_e32 v9, v9, v25 @@ -21597,6 +21689,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_min_f32_e32 v2, v2, v18 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17 ; GCN-NEXT: v_min_f32_e32 v0, v0, v16 +; 
GCN-NEXT: v_min_f32_e32 v14, v14, v27 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -21612,7 +21705,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -21622,22 +21715,22 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, 
v9 @@ -21656,16 +21749,16 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -21688,8 +21781,8 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 @@ -21708,7 +21801,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -23080,329 +23173,329 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x 
bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX8-NEXT: v_min_f32_e32 v31, v32, v31 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_min_f32_e32 v30, v14, v30 +; GFX8-NEXT: s_movk_i32 s6, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v32, v31 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s6, v14 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX8-NEXT: v_bfe_u32 v32, v30, 16, 1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 -; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX8-NEXT: v_min_f32_e32 v32, v32, v30 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v30 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; GFX8-NEXT: v_min_f32_e32 v33, v30, v33 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; 
GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: v_min_f32_e32 v29, v13, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX8-NEXT: v_min_f32_e32 v33, v33, v34 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX8-NEXT: v_min_f32_e32 v30, v15, v30 -; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; GFX8-NEXT: v_min_f32_e32 v31, v31, v32 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; GFX8-NEXT: v_min_f32_e32 v32, v15, v32 +; GFX8-NEXT: v_bfe_u32 v15, v31, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v31 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s6, v15 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_bfe_u32 v31, v32, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v32 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s6, v31 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 -; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_or_b32_e32 v33, 
0x400000, v13 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; GFX8-NEXT: v_min_f32_e32 v29, v33, v29 +; GFX8-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v33 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v32, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v33, v29 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 -; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX8-NEXT: v_min_f32_e32 v28, v33, v28 -; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_min_f32_e32 
v28, v12, v28 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v28 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX8-NEXT: v_min_f32_e32 v27, v33, v27 -; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_min_f32_e32 v27, v11, v27 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v27 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v27 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v27, 
v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX8-NEXT: v_min_f32_e32 v26, v33, v26 -; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_min_f32_e32 v26, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v26 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v26 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; 
GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 -; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX8-NEXT: v_min_f32_e32 v25, v33, v25 -; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_min_f32_e32 v25, v9, v25 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v25 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v25 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX8-NEXT: v_min_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_bfe_u32 
v33, v24, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_min_f32_e32 v24, v8, v24 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v24 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v24 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 -; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX8-NEXT: v_min_f32_e32 v23, v33, v23 -; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_min_f32_e32 v23, v7, v23 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v23 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v23 +; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX8-NEXT: v_min_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_min_f32_e32 v22, v6, v22 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v22 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: 
v_min_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 -; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX8-NEXT: v_min_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_min_f32_e32 v21, v5, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v21 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v21 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 
s4, v20 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s6, v20 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX8-NEXT: v_min_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_min_f32_e32 v4, v33, v4 +; GFX8-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX8-NEXT: v_min_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_min_f32_e32 
v3, v33, v3 +; GFX8-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX8-NEXT: v_min_f32_e32 v18, v33, v18 -; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_min_f32_e32 v2, v33, v2 +; GFX8-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 
v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX8-NEXT: v_min_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_min_f32_e32 v1, v33, v1 +; GFX8-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v0 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], s6, v1 +; GFX8-NEXT: v_or_b32_e32 v16, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v16, s[4:5] +; GFX8-NEXT: 
v_lshrrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_alignbit_b32 v5, v16, v5, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX8-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v23 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX8-NEXT: v_alignbit_b32 v7, v16, v7, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v8, v16, v8, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v17, 16 
+; GFX8-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v19, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX8-NEXT: v_alignbit_b32 v9, v16, v9, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v28 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v27 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v21, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v20, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v19, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v18, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v17, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minnum_v32bf16: @@ -23413,277 +23506,277 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX900-NEXT: v_min_f32_e32 v31, v32, v31 ; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_min_f32_e32 v30, v14, v30 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX900-NEXT: v_min_f32_e32 v30, v32, v30 -; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_add3_u32 v14, 
v32, v31, s4 ; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v33, vcc ; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX900-NEXT: v_min_f32_e32 v32, v32, v29 -; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; GFX900-NEXT: v_min_f32_e32 v33, v30, v33 +; GFX900-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_min_f32_e32 v29, v13, v29 +; GFX900-NEXT: v_add3_u32 v13, v34, v33, s4 +; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v33 +; GFX900-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc +; GFX900-NEXT: v_add3_u32 v31, v32, v29, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; GFX900-NEXT: v_min_f32_e32 v32, v29, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v31, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX900-NEXT: v_and_b32_e32 v15, 
0xffff0000, v15 ; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: v_min_f32_e32 v28, v12, v28 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX900-NEXT: v_min_f32_e32 v33, v33, v34 -; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX900-NEXT: v_min_f32_e32 v29, v15, v29 -; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v34 +; GFX900-NEXT: v_min_f32_e32 v31, v31, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GFX900-NEXT: v_min_f32_e32 v33, v15, v33 +; GFX900-NEXT: v_bfe_u32 v15, v31, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v31, v33, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v31, v31, v33, s4 ; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc ; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 ; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 
16, v11 -; GFX900-NEXT: v_min_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX900-NEXT: v_min_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v27, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX900-NEXT: 
v_bfe_u32 v33, v27, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX900-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v26, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX900-NEXT: 
v_min_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v25, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX900-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v24, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; GFX900-NEXT: 
v_lshlrev_b32_e32 v32, 16, v7 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX900-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v23, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX900-NEXT: v_cmp_u_f32_e32 
vcc, v6, v6 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX900-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v22, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX900-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX900-NEXT: v_min_f32_e32 v21, v33, v21 -; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_min_f32_e32 v5, 
v33, v5 +; GFX900-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 ; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX900-NEXT: v_min_f32_e32 v20, v33, v20 -; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_min_f32_e32 v4, v33, v4 +; GFX900-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 
1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX900-NEXT: v_min_f32_e32 v19, v33, v19 -; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_min_f32_e32 v3, v33, v3 +; GFX900-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX900-NEXT: v_min_f32_e32 v18, v33, v18 -; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_min_f32_e32 v2, v33, v2 +; GFX900-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX900-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX900-NEXT: v_min_f32_e32 v17, v33, v17 -; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_min_f32_e32 v1, v33, v1 +; GFX900-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v33, vcc +; GFX900-NEXT: v_perm_b32 v0, v0, v16, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v17, s4 +; GFX900-NEXT: v_perm_b32 v2, v3, v18, s4 +; GFX900-NEXT: v_perm_b32 v3, v4, v19, s4 +; GFX900-NEXT: v_perm_b32 v4, v5, v20, s4 +; GFX900-NEXT: v_perm_b32 v5, v21, v32, s4 +; GFX900-NEXT: v_perm_b32 v6, v22, v6, s4 +; GFX900-NEXT: v_perm_b32 v7, v23, v7, s4 +; GFX900-NEXT: v_perm_b32 v8, v24, v8, s4 +; GFX900-NEXT: v_perm_b32 v9, v25, v9, s4 +; GFX900-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX900-NEXT: v_perm_b32 v11, v27, v11, s4 +; GFX900-NEXT: v_perm_b32 v12, v28, v12, s4 +; GFX900-NEXT: v_perm_b32 v13, v29, v13, s4 +; GFX900-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX900-NEXT: v_perm_b32 v15, v31, v15, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minnum_v32bf16: @@ -23817,272 +23910,272 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v29 ; 
GFX10-NEXT: v_min_f32_e32 v30, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v29 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; GFX10-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX10-NEXT: v_min_f32_e32 v33, v33, v14 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX10-NEXT: v_add3_u32 v32, v32, v31, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_add3_u32 v31, v35, v30, 0x7fff -; GFX10-NEXT: v_min_f32_e32 v35, v13, v29 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v29 +; GFX10-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v31 +; GFX10-NEXT: v_min_f32_e32 v37, v13, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v32, v34, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v12 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GFX10-NEXT: v_add3_u32 v30, v34, v33, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_min_f32_e32 v34, v36, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_min_f32_e32 v33, v12, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX10-NEXT: v_min_f32_e32 v48, v14, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; GFX10-NEXT: v_min_f32_e32 v28, v12, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc_lo -; GFX10-NEXT: 
v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_min_f32_e32 v35, v36, v12 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v30 +; GFX10-NEXT: v_bfe_u32 v36, v32, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX10-NEXT: v_min_f32_e32 v54, v13, v12 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v10 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_min_f32_e32 v34, v11, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_min_f32_e32 v33, v36, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; GFX10-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v11, v11, v12 +; GFX10-NEXT: v_add3_u32 v12, v34, v31, 0x7fff +; GFX10-NEXT: v_min_f32_e32 v34, v27, v13 +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX10-NEXT: v_bfe_u32 v50, v48, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v12, v29, vcc_lo +; GFX10-NEXT: v_add3_u32 v12, v33, v30, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v52, v28, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v28 +; GFX10-NEXT: 
v_bfe_u32 v55, v54, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v54 +; GFX10-NEXT: v_cndmask_b32_e32 v27, v12, v35, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v26 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v27, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_min_f32_e32 v35, v10, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v25 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_min_f32_e32 v34, v36, v10 -; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_min_f32_e32 v10, v10, v12 +; GFX10-NEXT: v_add3_u32 v12, v36, v32, 0x7fff +; GFX10-NEXT: v_bfe_u32 v29, v11, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v35, v35, v26 +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v30, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v12, v38, vcc_lo +; GFX10-NEXT: v_add3_u32 v12, v39, v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; 
GFX10-NEXT: v_add3_u32 v29, v29, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v34 +; GFX10-NEXT: v_cndmask_b32_e32 v26, v12, v49, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_min_f32_e32 v33, v36, v33 +; GFX10-NEXT: v_bfe_u32 v36, v10, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v9, v9, v12 +; GFX10-NEXT: v_add3_u32 v12, v50, v48, 0x7fff +; GFX10-NEXT: v_min_f32_e32 v39, v39, v25 +; GFX10-NEXT: v_add3_u32 v25, v52, v28, 0x7fff ; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v25, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v9, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v55, v54, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v51, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_min_f32_e32 v24, v35, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v31, v32, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v9 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v53, vcc_lo +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v37, v35, 16, 1 ; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v24, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc_lo -; GFX10-NEXT: v_add3_u32 v31, v34, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX10-NEXT: v_bfe_u32 v35, v7, 16, 1 -; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v7, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo -; GFX10-NEXT: v_add3_u32 v32, v34, v8, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX10-NEXT: v_add3_u32 v24, v35, v7, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_add3_u32 v23, v30, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v31, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v6 +; GFX10-NEXT: v_bfe_u32 v49, v9, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v51, v52, v51 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_min_f32_e32 v8, v34, v8 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v33, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v28, v39, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v30, v30, v33 +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v32, vcc_lo +; GFX10-NEXT: v_add3_u32 v33, v36, v10, 0x7fff +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v31, v8, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v51, 16, 1 +; GFX10-NEXT: v_bfe_u32 v32, v7, 16, 1 +; GFX10-NEXT: v_bfe_u32 v10, v30, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX10-NEXT: v_add3_u32 v37, v37, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v21 ; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo -; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v8 +; GFX10-NEXT: v_add3_u32 v22, v49, v9, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v8 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v51 +; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v7 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v9, v9 +; GFX10-NEXT: v_add3_u32 v9, v28, v39, 0x7fff +; GFX10-NEXT: v_bfe_u32 v28, v6, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v35, v49, v35 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_add3_u32 v21, v31, v8, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v8, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v6, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX10-NEXT: v_add3_u32 v7, v35, v8, 0x7fff -; GFX10-NEXT: v_min_f32_e32 v35, v38, v37 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v21 -; GFX10-NEXT: v_bfe_u32 v37, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v22, s6 -; GFX10-NEXT: v_bfe_u32 v21, v35, 16, 1 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v8 -; GFX10-NEXT: v_add3_u32 v37, v37, v6, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GFX10-NEXT: v_add3_u32 v8, v11, v51, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v51, v51 +; GFX10-NEXT: v_add3_u32 v32, v32, v7, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v20 +; GFX10-NEXT: v_cmp_u_f32_e64 
s8, v7, v7 +; GFX10-NEXT: v_add3_u32 v7, v10, v30, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v39 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v39, v39 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v6 +; GFX10-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v28, v28, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX10-NEXT: v_min_f32_e32 v10, v10, v51 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_add3_u32 v6, v21, v35, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX10-NEXT: v_bfe_u32 v48, v5, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v35, v35 -; GFX10-NEXT: v_min_f32_e32 v8, v21, v8 -; GFX10-NEXT: v_add3_u32 v21, v48, v5, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v30, v30 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v5 -; GFX10-NEXT: v_bfe_u32 v20, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v5, v5 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_min_f32_e32 v48, v49, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; GFX10-NEXT: v_add3_u32 v20, v20, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v8, v8 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v4, v4 -; GFX10-NEXT: v_bfe_u32 v4, v48, 16, 1 -; GFX10-NEXT: v_min_f32_e32 v49, v51, v49 -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v48, v48 +; GFX10-NEXT: v_add3_u32 v20, v49, v35, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v10, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v6, v51, v6 +; GFX10-NEXT: v_or_b32_e32 v30, 
0x400000, v5 +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v4, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v5, v5 +; GFX10-NEXT: v_add3_u32 v5, v49, v10, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v10 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v10, v10 +; GFX10-NEXT: v_add3_u32 v10, v51, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v4, v4 +; GFX10-NEXT: v_add3_u32 v4, v49, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_add3_u32 v4, v4, v48, 0x7fff -; GFX10-NEXT: v_bfe_u32 v48, v49, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v49, v49 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v32, v32, v36, s8 +; GFX10-NEXT: v_min_f32_e32 v6, v54, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v38, s9 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v39, s8 -; GFX10-NEXT: v_add3_u32 v19, v48, v49, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v17 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v39, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v31, s11 +; GFX10-NEXT: v_bfe_u32 v19, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v30, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v35, s13 +; GFX10-NEXT: v_add3_u32 v19, v19, v6, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; 
GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v51, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v52, s6 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v17 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v35, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v50, s10 -; GFX10-NEXT: v_min_f32_e32 v49, v52, v49 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v49, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v34, s7 +; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v6, v18, v6 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_bfe_u32 v18, v49, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v49 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v49, v49 -; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v1 -; GFX10-NEXT: v_add3_u32 v18, v18, v49, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v48, s13 -; GFX10-NEXT: v_min_f32_e32 v17, v49, v17 +; GFX10-NEXT: v_bfe_u32 v36, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX10-NEXT: v_min_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v18, v37, v48, vcc_lo ; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v35, vcc_lo -; GFX10-NEXT: v_bfe_u32 v22, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v17 -; GFX10-NEXT: v_bfe_u32 v50, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 
vcc_lo, v6, v6 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v30, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v51, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v30, v30, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v0 +; GFX10-NEXT: v_bfe_u32 v48, v3, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v31, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v0 -; GFX10-NEXT: v_add3_u32 v49, v49, v17, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_add3_u32 v50, v50, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v36, s4 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v49, v8, vcc_lo +; GFX10-NEXT: v_add3_u32 v51, v51, v0, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v30, v35, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v38, s7 -; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX10-NEXT: v_add3_u32 v22, v22, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v34, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v50, v48, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v50, s4 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v3 +; GFX10-NEXT: v_add3_u32 v48, v48, v3, 0x7fff +; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v51, v49, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v52, s14 -; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v38, vcc_lo +; GFX10-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v53, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v54, s16 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v52, v34, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v51, s12 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v25, v12, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v26, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v27, v14, 0x7060302 ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v36, v34, vcc_lo -; GFX10-NEXT: v_perm_b32 v10, v25, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v26, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v27, v12, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v28, v13, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v48, v50, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v3, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v5, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v21, v6, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v37, v7, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v24, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v29, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v10, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v11, v20, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v16 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_min_f32_e32 v17, v33, v8 -; GFX10-NEXT: v_min_f32_e32 v15, v15, v16 -; GFX10-NEXT: v_perm_b32 v8, v32, v31, 0x7060302 -; GFX10-NEXT: v_bfe_u32 v16, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v18, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_min_f32_e32 v17, v37, v6 +; GFX10-NEXT: v_min_f32_e32 v11, v15, v16 +; GFX10-NEXT: v_perm_b32 v6, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v32, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v21, v9, 
0x7060302 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX10-NEXT: v_perm_b32 v9, v22, v18, 0x7060302 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v15 -; GFX10-NEXT: v_add3_u32 v16, v16, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v18, v18, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX10-NEXT: v_add3_u32 v15, v15, v11, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v33, v23, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v18, v16, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_perm_b32 v11, v29, v24, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v19, vcc_lo ; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -24090,302 +24183,293 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 -; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11TRUE16-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 ; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 
16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 ; GFX11TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX11TRUE16-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 -; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11TRUE16-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24 +; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v2 ; GFX11TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 -; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff -; GFX11TRUE16-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v18, 16, v18 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1 -; 
GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 -; GFX11TRUE16-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX11TRUE16-NEXT: v_dual_min_f32 v18, v84, v83 :: v_dual_min_f32 v9, v9, v25 -; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 -; GFX11TRUE16-NEXT: v_min_f32_e32 v17, v86, v85 -; GFX11TRUE16-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 ; GFX11TRUE16-NEXT: v_bfe_u32 v145, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v17 -; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 -; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11TRUE16-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27 ; GFX11TRUE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX11TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX11TRUE16-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_and_b32 v37, 0xffff0000, v28 +; GFX11TRUE16-NEXT: 
v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_and_b32 v49, 0xffff0000, v26 +; GFX11TRUE16-NEXT: v_min_f32_e32 v24, v64, v55 ; GFX11TRUE16-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX11TRUE16-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v28, 16, v28 -; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_min_f32_e32 v23, v66, v65 +; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 ; GFX11TRUE16-NEXT: v_bfe_u32 v85, v24, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 +; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 ; GFX11TRUE16-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_min_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v19, 16, v19 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 ; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v23 -; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 +; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11TRUE16-NEXT: 
v_lshlrev_b32_e32 v11, 16, v11 -; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 -; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11TRUE16-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX11TRUE16-NEXT: v_min_f32_e32 v20, v80, v71 -; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 -; GFX11TRUE16-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_lshlrev_b32 v10, 16, v10 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11TRUE16-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v27, 16, v27 -; GFX11TRUE16-NEXT: v_dual_min_f32 v26, v52, v51 :: v_dual_min_f32 v25, v54, v53 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX11TRUE16-NEXT: v_dual_min_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11TRUE16-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_and_b32 v51, 0xffff0000, v25 +; GFX11TRUE16-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v52, 0xffff0000, v9 +; GFX11TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX11TRUE16-NEXT: v_dual_min_f32 v20, v80, v71 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11TRUE16-NEXT: v_dual_min_f32 v2, v2, v18 :: v_dual_lshlrev_b32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v28, 16, v28 +; GFX11TRUE16-NEXT: v_min_f32_e32 v18, v84, v83 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; 
GFX11TRUE16-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11TRUE16-NEXT: v_min_f32_e32 v25, v54, v53 +; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11TRUE16-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11TRUE16-NEXT: v_dual_min_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v5, 16, v5 ; GFX11TRUE16-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v36, 0xffff0000, v13 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; GFX11TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11TRUE16-NEXT: v_min_f32_e32 v27, v50, v49 +; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11TRUE16-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX11TRUE16-NEXT: v_min_f32_e32 v21, v70, v69 +; GFX11TRUE16-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v13, 16, v13 ; GFX11TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11TRUE16-NEXT: v_min_f32_e32 v22, v68, v67 -; GFX11TRUE16-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11TRUE16-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v12, 16, v12 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_min_f32_e32 v26, v52, v51 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v14, 16, v14 ; 
GFX11TRUE16-NEXT: v_dual_min_f32 v29, v38, v37 :: v_dual_lshlrev_b32 v30, 16, v30 ; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GFX11TRUE16-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX11TRUE16-NEXT: v_min_f32_e32 v28, v48, v39 ; GFX11TRUE16-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33 -; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v35, v14, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v16, v33, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 -; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11TRUE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 -; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v49, 
v49, v29, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1 -; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11TRUE16-NEXT: v_bfe_u32 v67, v10, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11TRUE16-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v26 -; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 +; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1 +; 
GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 +; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 ; GFX11TRUE16-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v22 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 ; GFX11TRUE16-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v21 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11TRUE16-NEXT: v_bfe_u32 v115, v4, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v4 ; GFX11TRUE16-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; 
GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v20 +; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff -; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 +; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v148, 0x400000, v0 +; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27 -; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28 -; 
GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26 -; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29 +; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 +; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 -; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26 ; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v25 +; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28 +; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29 +; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v24 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v23 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v22 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v21 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v20 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v19 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v148, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v18 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v32 -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17 ; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v32 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_min_f32_e32 v15, v15, v33 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_min_f32_e32 v17, v31, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfe_u32 v18, v15, 16, 1 +; 
GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17 ; GFX11TRUE16-NEXT: v_add3_u32 v18, v18, v15, 0x7fff -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v17, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v21, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v17 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24393,219 +24477,218 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v17 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v26 -; GFX11FAKE16-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; 
GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v86, 16, v0 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11FAKE16-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11FAKE16-NEXT: v_min_f32_e32 v17, v86, v85 ; GFX11FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 -; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16 -; GFX11FAKE16-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 -; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GFX11FAKE16-NEXT: 
v_bfe_u32 v119, v3, 16, 1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v8 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11FAKE16-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 -; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 -; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_bfe_u32 v145, v17, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v146, 0x400000, v17 -; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 ; GFX11FAKE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v23 ; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX11FAKE16-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX11FAKE16-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX11FAKE16-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11FAKE16-NEXT: 
v_and_b32_e32 v19, 0xffff0000, v19 ; GFX11FAKE16-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX11FAKE16-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83 -; GFX11FAKE16-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11FAKE16-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 +; GFX11FAKE16-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11FAKE16-NEXT: v_min_f32_e32 v24, v64, v55 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 ; GFX11FAKE16-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 ; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v23 -; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX11FAKE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 -; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; 
GFX11FAKE16-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX11FAKE16-NEXT: v_min_f32_e32 v20, v80, v71 +; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX11FAKE16-NEXT: v_dual_min_f32 v4, v4, v20 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX11FAKE16-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX11FAKE16-NEXT: v_dual_min_f32 v20, v80, v71 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11FAKE16-NEXT: v_min_f32_e32 v19, v82, v81 +; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX11FAKE16-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11FAKE16-NEXT: v_min_f32_e32 v18, v84, v83 +; GFX11FAKE16-NEXT: v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_min_f32_e32 
v12, v12, v28 +; GFX11FAKE16-NEXT: v_dual_min_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 +; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff ; GFX11FAKE16-NEXT: v_bfe_u32 v71, v9, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11FAKE16-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11FAKE16-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 -; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11FAKE16-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX11FAKE16-NEXT: v_min_f32_e32 v21, v70, v69 +; GFX11FAKE16-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX11FAKE16-NEXT: v_min_f32_e32 v26, v52, v51 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11FAKE16-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11FAKE16-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 -; GFX11FAKE16-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 
16, v12 -; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11FAKE16-NEXT: v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11FAKE16-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_min_f32_e32 v29, v38, v37 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11FAKE16-NEXT: v_dual_min_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; GFX11FAKE16-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX11FAKE16-NEXT: v_min_f32_e32 v28, v48, v39 ; GFX11FAKE16-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33 -; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1 ; GFX11FAKE16-NEXT: v_bfe_u32 v35, v14, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v14 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11FAKE16-NEXT: v_bfe_u32 v16, v33, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11FAKE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 -; GFX11FAKE16-NEXT: 
v_bfe_u32 v49, v29, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11FAKE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 -; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 ; GFX11FAKE16-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11FAKE16-NEXT: v_bfe_u32 v69, v26, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v26 -; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, 
vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11FAKE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 ; GFX11FAKE16-NEXT: v_bfe_u32 v83, v8, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11FAKE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11FAKE16-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v22 -; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11FAKE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 ; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v21 ; GFX11FAKE16-NEXT: v_bfe_u32 v115, v4, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 
; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v4 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11FAKE16-NEXT: v_bfe_u32 v117, v20, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v20 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11FAKE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 ; GFX11FAKE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11FAKE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 -; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11FAKE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v148, 0x400000, v0 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11FAKE16-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 -; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff ; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, 
vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff ; GFX11FAKE16-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 ; GFX11FAKE16-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 ; GFX11FAKE16-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -24638,19 +24721,20 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v147, v148 :: v_dual_and_b32 v15, 0xffff0000, v15 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v3, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v32 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_min_f32_e32 v17, v31, v17 ; GFX11FAKE16-NEXT: v_min_f32_e32 v15, v15, v18 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_bfe_u32 v18, v17, 16, 1 @@ -26043,11 +26127,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_max_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 @@ -26060,6 +26139,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_max_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 
+; GCN-NEXT: v_max_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -26082,11 +26165,9 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_max_f32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -26109,6 +26190,8 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_max_f32_e32 v10, v10, v26 ; GCN-NEXT: v_max_f32_e32 v9, v9, v25 @@ -26121,6 +26204,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_max_f32_e32 v2, v2, v18 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17 ; GCN-NEXT: v_max_f32_e32 v0, v0, v16 +; GCN-NEXT: v_max_f32_e32 v14, v14, v27 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -26136,7 +26220,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v28 ; 
GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -26146,22 +26230,22 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -26180,16 +26264,16 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; 
GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -26212,8 +26296,8 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 @@ -26232,7 +26316,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -27604,329 +27688,329 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX8-NEXT: v_max_f32_e32 v31, v32, v31 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_and_b32_e32 v30, 
0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_max_f32_e32 v30, v14, v30 +; GFX8-NEXT: s_movk_i32 s6, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v32, v31 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s6, v14 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX8-NEXT: v_bfe_u32 v32, v30, 16, 1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 -; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX8-NEXT: v_max_f32_e32 v32, v32, v30 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v30 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; GFX8-NEXT: v_max_f32_e32 v33, v30, v33 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: v_max_f32_e32 v29, v13, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX8-NEXT: v_max_f32_e32 v33, v33, v34 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; 
GFX8-NEXT: v_max_f32_e32 v30, v15, v30 -; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; GFX8-NEXT: v_max_f32_e32 v31, v31, v32 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; GFX8-NEXT: v_max_f32_e32 v32, v15, v32 +; GFX8-NEXT: v_bfe_u32 v15, v31, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v31 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s6, v15 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_bfe_u32 v31, v32, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v32 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s6, v31 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 -; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; GFX8-NEXT: v_max_f32_e32 v29, v33, v29 +; GFX8-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v33 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v32, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v33, v29 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 -; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX8-NEXT: v_max_f32_e32 v28, v33, v28 -; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_max_f32_e32 v28, v12, v28 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v28 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; 
GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX8-NEXT: v_max_f32_e32 v27, v33, v27 -; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_max_f32_e32 v27, v11, v27 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v27 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v27 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX8-NEXT: 
v_bfe_u32 v26, v10, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX8-NEXT: v_max_f32_e32 v26, v33, v26 -; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_max_f32_e32 v26, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v26 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v26 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 -; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX8-NEXT: v_max_f32_e32 v25, v33, v25 -; GFX8-NEXT: v_bfe_u32 v33, 
v25, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_max_f32_e32 v25, v9, v25 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v25 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v25 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX8-NEXT: v_max_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_max_f32_e32 v24, v8, v24 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v24 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v24 +; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 -; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX8-NEXT: v_max_f32_e32 v23, v33, v23 -; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_max_f32_e32 v23, v7, v23 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v23 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v23 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: 
v_max_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX8-NEXT: v_max_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_max_f32_e32 v22, v6, v22 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v22 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 -; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 
16, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX8-NEXT: v_max_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_max_f32_e32 v21, v5, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v21 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v21 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v32, v33, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v34 +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 -; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s6, v20 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX8-NEXT: v_max_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX8-NEXT: 
v_add_u32_e32 v33, vcc, v33, v20 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_max_f32_e32 v4, v33, v4 +; GFX8-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX8-NEXT: v_max_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_max_f32_e32 v3, v33, v3 +; GFX8-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v19, v19 -; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX8-NEXT: v_max_f32_e32 v18, v33, v18 -; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_max_f32_e32 v2, v33, v2 +; GFX8-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX8-NEXT: 
v_lshlrev_b32_e32 v17, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX8-NEXT: v_max_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_max_f32_e32 v1, v33, v1 +; GFX8-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v0 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], s6, v1 +; GFX8-NEXT: v_or_b32_e32 v16, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v16, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX8-NEXT: v_alignbit_b32 v5, v16, v5, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX8-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v23 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; 
GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX8-NEXT: v_alignbit_b32 v7, v16, v7, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v8, v16, v8, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v19, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX8-NEXT: v_alignbit_b32 v9, v16, v9, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v28 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v27 +; GFX8-NEXT: 
v_alignbit_b32 v4, v4, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v21, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v20, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v19, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v18, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v17, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maxnum_v32bf16: @@ -27937,277 +28021,277 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX900-NEXT: v_max_f32_e32 v31, v32, v31 ; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_max_f32_e32 v30, v14, v30 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX900-NEXT: v_max_f32_e32 v30, v32, v30 -; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_add3_u32 v14, v32, v31, s4 ; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v33, vcc ; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v29 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX900-NEXT: v_bfe_u32 v29, v13, 
16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX900-NEXT: v_max_f32_e32 v32, v32, v29 -; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; GFX900-NEXT: v_max_f32_e32 v33, v30, v33 +; GFX900-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_max_f32_e32 v29, v13, v29 +; GFX900-NEXT: v_add3_u32 v13, v34, v33, s4 +; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v33 +; GFX900-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc +; GFX900-NEXT: v_add3_u32 v31, v32, v29, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; GFX900-NEXT: v_max_f32_e32 v32, v29, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v31, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX900-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: v_max_f32_e32 v28, v12, v28 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX900-NEXT: v_max_f32_e32 v33, v33, v34 -; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; 
GFX900-NEXT: v_max_f32_e32 v29, v15, v29 -; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v34 +; GFX900-NEXT: v_max_f32_e32 v31, v31, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GFX900-NEXT: v_max_f32_e32 v33, v15, v33 +; GFX900-NEXT: v_bfe_u32 v15, v31, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v31, v33, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v31, v31, v33, s4 ; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc ; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 ; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX900-NEXT: v_max_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX900-NEXT: v_max_f32_e32 v32, 
v32, v34 ; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX900-NEXT: v_max_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v27, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX900-NEXT: v_max_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX900-NEXT: 
v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX900-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v26, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX900-NEXT: v_max_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX900-NEXT: v_max_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v25, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v25 +; GFX900-NEXT: 
v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX900-NEXT: v_max_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX900-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v24, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX900-NEXT: v_max_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX900-NEXT: 
v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX900-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v23, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX900-NEXT: v_max_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX900-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v22, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_bfe_u32 
v32, v22, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v32, v32, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX900-NEXT: v_max_f32_e32 v32, v32, v34 ; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX900-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX900-NEXT: v_max_f32_e32 v21, v33, v21 -; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_max_f32_e32 v5, v33, v5 +; GFX900-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 ; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 
0x400000, v21 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX900-NEXT: v_max_f32_e32 v20, v33, v20 -; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_max_f32_e32 v4, v33, v4 +; GFX900-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX900-NEXT: 
v_max_f32_e32 v19, v33, v19 -; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_max_f32_e32 v3, v33, v3 +; GFX900-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX900-NEXT: v_max_f32_e32 v18, v33, v18 -; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_max_f32_e32 v2, v33, v2 +; GFX900-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; 
GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v33, vcc ; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX900-NEXT: v_max_f32_e32 v17, v33, v17 -; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_max_f32_e32 v1, v33, v1 +; GFX900-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX900-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX900-NEXT: 
v_perm_b32 v7, v7, v24, s4 -; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v33, vcc +; GFX900-NEXT: v_perm_b32 v0, v0, v16, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v17, s4 +; GFX900-NEXT: v_perm_b32 v2, v3, v18, s4 +; GFX900-NEXT: v_perm_b32 v3, v4, v19, s4 +; GFX900-NEXT: v_perm_b32 v4, v5, v20, s4 +; GFX900-NEXT: v_perm_b32 v5, v21, v32, s4 +; GFX900-NEXT: v_perm_b32 v6, v22, v6, s4 +; GFX900-NEXT: v_perm_b32 v7, v23, v7, s4 +; GFX900-NEXT: v_perm_b32 v8, v24, v8, s4 +; GFX900-NEXT: v_perm_b32 v9, v25, v9, s4 +; GFX900-NEXT: v_perm_b32 v10, v26, v10, s4 +; GFX900-NEXT: v_perm_b32 v11, v27, v11, s4 +; GFX900-NEXT: v_perm_b32 v12, v28, v12, s4 +; GFX900-NEXT: v_perm_b32 v13, v29, v13, s4 +; GFX900-NEXT: v_perm_b32 v14, v30, v14, s4 +; GFX900-NEXT: v_perm_b32 v15, v31, v15, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maxnum_v32bf16: @@ -28341,272 +28425,272 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v29 ; GFX10-NEXT: v_max_f32_e32 v30, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v29 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; GFX10-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX10-NEXT: v_max_f32_e32 v33, v33, v14 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX10-NEXT: v_add3_u32 v32, v32, v31, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; 
GFX10-NEXT: v_add3_u32 v31, v35, v30, 0x7fff -; GFX10-NEXT: v_max_f32_e32 v35, v13, v29 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v29 +; GFX10-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v31 +; GFX10-NEXT: v_max_f32_e32 v37, v13, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v32, v34, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v12 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GFX10-NEXT: v_add3_u32 v30, v34, v33, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_max_f32_e32 v34, v36, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_max_f32_e32 v33, v12, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX10-NEXT: v_max_f32_e32 v48, v14, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; GFX10-NEXT: v_max_f32_e32 v28, v12, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_max_f32_e32 v35, v36, v12 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v30 +; GFX10-NEXT: v_bfe_u32 v36, v32, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX10-NEXT: 
v_max_f32_e32 v54, v13, v12 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v10 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_max_f32_e32 v34, v11, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_max_f32_e32 v33, v36, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; GFX10-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v12 +; GFX10-NEXT: v_add3_u32 v12, v34, v31, 0x7fff +; GFX10-NEXT: v_max_f32_e32 v34, v27, v13 +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX10-NEXT: v_bfe_u32 v50, v48, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v12, v29, vcc_lo +; GFX10-NEXT: v_add3_u32 v12, v33, v30, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v52, v28, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v28 +; GFX10-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v54 +; GFX10-NEXT: v_cndmask_b32_e32 v27, v12, v35, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v26 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v27, v30, v31, vcc_lo 
-; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_max_f32_e32 v35, v10, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v25 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_max_f32_e32 v34, v36, v10 -; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v12 +; GFX10-NEXT: v_add3_u32 v12, v36, v32, 0x7fff +; GFX10-NEXT: v_bfe_u32 v29, v11, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v35, v35, v26 +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v30, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v12, v38, vcc_lo +; GFX10-NEXT: v_add3_u32 v12, v39, v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX10-NEXT: v_add3_u32 v29, v29, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v34 +; GFX10-NEXT: v_cndmask_b32_e32 v26, v12, v49, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 
v10, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 -; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX10-NEXT: v_max_f32_e32 v33, v36, v33 +; GFX10-NEXT: v_bfe_u32 v36, v10, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v12 +; GFX10-NEXT: v_add3_u32 v12, v50, v48, 0x7fff +; GFX10-NEXT: v_max_f32_e32 v39, v39, v25 +; GFX10-NEXT: v_add3_u32 v25, v52, v28, 0x7fff ; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v25, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v32, v9, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v55, v54, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v51, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_max_f32_e32 v24, v35, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc_lo -; GFX10-NEXT: v_add3_u32 v31, v32, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v9 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v53, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v37, v35, 16, 1 ; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v24, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc_lo -; GFX10-NEXT: v_add3_u32 v31, v34, v33, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v34, 
v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX10-NEXT: v_bfe_u32 v35, v7, 16, 1 -; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v7, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo -; GFX10-NEXT: v_add3_u32 v32, v34, v8, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX10-NEXT: v_add3_u32 v24, v35, v7, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_add3_u32 v23, v30, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v31, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v6 +; GFX10-NEXT: v_bfe_u32 v49, v9, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v51, v52, v51 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v8, v34, v8 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v33, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v28, v39, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v30, v30, v33 +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v32, vcc_lo +; GFX10-NEXT: v_add3_u32 v33, v36, v10, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v31, v8, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v51, 16, 1 +; GFX10-NEXT: v_bfe_u32 v32, v7, 16, 1 +; GFX10-NEXT: v_bfe_u32 v10, v30, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX10-NEXT: v_add3_u32 v37, v37, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v21 ; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v32, 
v32, v33, vcc_lo -; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v8 +; GFX10-NEXT: v_add3_u32 v22, v49, v9, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v8 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v51 +; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v7 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v9, v9 +; GFX10-NEXT: v_add3_u32 v9, v28, v39, 0x7fff +; GFX10-NEXT: v_bfe_u32 v28, v6, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v35, v49, v35 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_add3_u32 v21, v31, v8, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v8, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v6, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX10-NEXT: v_add3_u32 v7, v35, v8, 0x7fff -; GFX10-NEXT: v_max_f32_e32 v35, v38, v37 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v21 -; GFX10-NEXT: v_bfe_u32 v37, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v22, s6 -; GFX10-NEXT: v_bfe_u32 v21, v35, 16, 1 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v8 -; GFX10-NEXT: v_add3_u32 v37, v37, v6, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GFX10-NEXT: v_add3_u32 v8, v11, v51, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v51, v51 +; GFX10-NEXT: v_add3_u32 v32, v32, v7, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v20 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v7, v7 +; GFX10-NEXT: v_add3_u32 v7, v10, v30, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v39 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v39, v39 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v6 +; GFX10-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v28, v28, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v51 ; 
GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_add3_u32 v6, v21, v35, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX10-NEXT: v_bfe_u32 v48, v5, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v35, v35 -; GFX10-NEXT: v_max_f32_e32 v8, v21, v8 -; GFX10-NEXT: v_add3_u32 v21, v48, v5, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v30, v30 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v5 -; GFX10-NEXT: v_bfe_u32 v20, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v5, v5 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_max_f32_e32 v48, v49, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; GFX10-NEXT: v_add3_u32 v20, v20, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v8, v8 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v4, v4 -; GFX10-NEXT: v_bfe_u32 v4, v48, 16, 1 -; GFX10-NEXT: v_max_f32_e32 v49, v51, v49 -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v48, v48 +; GFX10-NEXT: v_add3_u32 v20, v49, v35, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v10, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v6, v51, v6 +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v4, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v5, v5 +; GFX10-NEXT: v_add3_u32 v5, v49, v10, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v10 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v10, v10 +; GFX10-NEXT: v_add3_u32 v10, v51, v4, 0x7fff +; 
GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v4, v4 +; GFX10-NEXT: v_add3_u32 v4, v49, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_add3_u32 v4, v4, v48, 0x7fff -; GFX10-NEXT: v_bfe_u32 v48, v49, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v49, v49 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v32, v32, v36, s8 +; GFX10-NEXT: v_max_f32_e32 v6, v54, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v38, s9 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v39, s8 -; GFX10-NEXT: v_add3_u32 v19, v48, v49, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v17 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v39, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v31, s11 +; GFX10-NEXT: v_bfe_u32 v19, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v30, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v35, s13 +; GFX10-NEXT: v_add3_u32 v19, v19, v6, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v51, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v52, s6 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v17 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v35, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v50, s10 -; GFX10-NEXT: v_max_f32_e32 v49, v52, v49 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 
v8, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v49, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v34, s7 +; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v6, v18, v6 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_bfe_u32 v18, v49, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v49 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v49, v49 -; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v1 -; GFX10-NEXT: v_add3_u32 v18, v18, v49, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v48, s13 -; GFX10-NEXT: v_max_f32_e32 v17, v49, v17 +; GFX10-NEXT: v_bfe_u32 v36, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX10-NEXT: v_max_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v18, v37, v48, vcc_lo ; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v35, vcc_lo -; GFX10-NEXT: v_bfe_u32 v22, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v17 -; GFX10-NEXT: v_bfe_u32 v50, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v30, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v51, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v30, v30, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v0 +; GFX10-NEXT: v_bfe_u32 v48, v3, 16, 
1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v31, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v0 -; GFX10-NEXT: v_add3_u32 v49, v49, v17, 0x7fff -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_add3_u32 v50, v50, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v36, s4 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v49, v8, vcc_lo +; GFX10-NEXT: v_add3_u32 v51, v51, v0, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v30, v35, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v38, s7 -; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX10-NEXT: v_add3_u32 v22, v22, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v34, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v50, v48, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v50, s4 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v3 +; GFX10-NEXT: v_add3_u32 v48, v48, v3, 0x7fff +; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v51, v49, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v52, s14 -; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v38, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v53, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v54, s16 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v52, v34, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v51, s12 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x7060302 +; GFX10-NEXT: v_perm_b32 v12, v25, v12, 
0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v26, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v27, v14, 0x7060302 ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v36, v34, vcc_lo -; GFX10-NEXT: v_perm_b32 v10, v25, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v26, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v27, v12, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v28, v13, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v48, v50, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v3, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v5, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v21, v6, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v37, v7, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v24, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v29, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v10, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v11, v20, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v16 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_max_f32_e32 v17, v33, v8 -; GFX10-NEXT: v_max_f32_e32 v15, v15, v16 -; GFX10-NEXT: v_perm_b32 v8, v32, v31, 0x7060302 -; GFX10-NEXT: v_bfe_u32 v16, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v18, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_max_f32_e32 v17, v37, v6 +; GFX10-NEXT: v_max_f32_e32 v11, v15, v16 +; GFX10-NEXT: v_perm_b32 v6, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v32, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v21, v9, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX10-NEXT: v_perm_b32 v9, v22, v18, 0x7060302 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v15 -; GFX10-NEXT: v_add3_u32 v16, v16, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v18, v18, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v15, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX10-NEXT: v_add3_u32 v15, v15, v11, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v33, v23, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v18, v16, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_perm_b32 v11, v29, v24, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v19, vcc_lo ; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -28614,302 +28698,293 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 -; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11TRUE16-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 ; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 ; GFX11TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v52, 
0xffff0000, v9 -; GFX11TRUE16-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 -; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 -; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11TRUE16-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24 +; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v2 ; GFX11TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 -; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff -; GFX11TRUE16-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v18, 16, v18 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 -; GFX11TRUE16-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX11TRUE16-NEXT: v_dual_max_f32 v18, v84, v83 :: v_dual_max_f32 v9, v9, v25 -; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 +; 
GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 -; GFX11TRUE16-NEXT: v_max_f32_e32 v17, v86, v85 -; GFX11TRUE16-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 ; GFX11TRUE16-NEXT: v_bfe_u32 v145, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v17 -; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 -; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11TRUE16-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27 ; GFX11TRUE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX11TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 -; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX11TRUE16-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_and_b32 v37, 0xffff0000, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_and_b32 v49, 0xffff0000, v26 +; GFX11TRUE16-NEXT: v_max_f32_e32 v24, v64, v55 ; GFX11TRUE16-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX11TRUE16-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v28, 16, v28 -; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_max_f32_e32 v23, 
v66, v65 +; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 ; GFX11TRUE16-NEXT: v_bfe_u32 v85, v24, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 +; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 ; GFX11TRUE16-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_max_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v19, 16, v19 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 ; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v23 -; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 +; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 -; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11TRUE16-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX11TRUE16-NEXT: v_max_f32_e32 v20, v80, v71 -; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 -; GFX11TRUE16-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_lshlrev_b32 v10, 16, v10 -; GFX11TRUE16-NEXT: 
v_lshlrev_b32_e32 v29, 16, v29 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11TRUE16-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v27, 16, v27 -; GFX11TRUE16-NEXT: v_dual_max_f32 v26, v52, v51 :: v_dual_max_f32 v25, v54, v53 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX11TRUE16-NEXT: v_dual_max_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11TRUE16-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_and_b32 v51, 0xffff0000, v25 +; GFX11TRUE16-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v52, 0xffff0000, v9 +; GFX11TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX11TRUE16-NEXT: v_dual_max_f32 v20, v80, v71 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11TRUE16-NEXT: v_dual_max_f32 v2, v2, v18 :: v_dual_lshlrev_b32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v28, 16, v28 +; GFX11TRUE16-NEXT: v_max_f32_e32 v18, v84, v83 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11TRUE16-NEXT: v_max_f32_e32 v25, v54, v53 +; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11TRUE16-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11TRUE16-NEXT: v_dual_max_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v5, 16, v5 ; GFX11TRUE16-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v36, 0xffff0000, v13 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 
16, v13 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; GFX11TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11TRUE16-NEXT: v_max_f32_e32 v27, v50, v49 +; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11TRUE16-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX11TRUE16-NEXT: v_max_f32_e32 v21, v70, v69 +; GFX11TRUE16-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v13, 16, v13 ; GFX11TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11TRUE16-NEXT: v_max_f32_e32 v22, v68, v67 -; GFX11TRUE16-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11TRUE16-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v12, 16, v12 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_max_f32_e32 v26, v52, v51 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v14, 16, v14 ; GFX11TRUE16-NEXT: v_dual_max_f32 v29, v38, v37 :: v_dual_lshlrev_b32 v30, 16, v30 ; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GFX11TRUE16-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | 
instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX11TRUE16-NEXT: v_max_f32_e32 v28, v48, v39 ; GFX11TRUE16-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33 -; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v35, v14, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v16, v33, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 -; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11TRUE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 -; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v13, v13 -; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1 -; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11TRUE16-NEXT: v_bfe_u32 v67, v10, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11TRUE16-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v26 -; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 +; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; 
GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 +; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 ; GFX11TRUE16-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v22 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 ; GFX11TRUE16-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v21 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11TRUE16-NEXT: v_bfe_u32 v115, v4, 16, 1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v4 ; GFX11TRUE16-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v20 +; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; 
GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff -; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 +; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v148, 0x400000, v0 +; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27 -; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28 -; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26 -; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29 +; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 +; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 -; 
GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26 ; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v25 +; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28 +; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29 +; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v24 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v23 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; 
GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v22 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v21 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v20 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v19 -; GFX11TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v148, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v18 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v32 -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17 ; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v32 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_max_f32_e32 v15, v15, v33 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_max_f32_e32 v17, v31, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfe_u32 v18, v15, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17 ; GFX11TRUE16-NEXT: v_add3_u32 v18, v18, v15, 0x7fff -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_add3_u32 v19, 
v19, v17, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v21, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v17 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -28917,219 +28992,218 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16: ; %bb.0: ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v17 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v26 -; GFX11FAKE16-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v86, 16, v0 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11FAKE16-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11FAKE16-NEXT: v_max_f32_e32 v17, v86, v85 ; GFX11FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v1 -; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16 -; GFX11FAKE16-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 -; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v8 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11FAKE16-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11FAKE16-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 -; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 -; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_bfe_u32 v145, v17, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v146, 0x400000, v17 -; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 ; GFX11FAKE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v23 ; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX11FAKE16-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX11FAKE16-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX11FAKE16-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX11FAKE16-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX11FAKE16-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83 -; GFX11FAKE16-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11FAKE16-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 +; 
GFX11FAKE16-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11FAKE16-NEXT: v_max_f32_e32 v24, v64, v55 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 ; GFX11FAKE16-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v24 ; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v23 -; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX11FAKE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7 -; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11FAKE16-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX11FAKE16-NEXT: v_max_f32_e32 v20, v80, v71 +; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX11FAKE16-NEXT: v_dual_max_f32 v4, v4, v20 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | 
instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX11FAKE16-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX11FAKE16-NEXT: v_dual_max_f32 v20, v80, v71 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11FAKE16-NEXT: v_max_f32_e32 v19, v82, v81 +; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX11FAKE16-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11FAKE16-NEXT: v_max_f32_e32 v18, v84, v83 +; GFX11FAKE16-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX11FAKE16-NEXT: v_dual_max_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 +; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff ; GFX11FAKE16-NEXT: v_bfe_u32 v71, v9, 16, 1 ; 
GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11FAKE16-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11FAKE16-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 -; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11FAKE16-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX11FAKE16-NEXT: v_max_f32_e32 v21, v70, v69 +; GFX11FAKE16-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX11FAKE16-NEXT: v_max_f32_e32 v26, v52, v51 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11FAKE16-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11FAKE16-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 -; GFX11FAKE16-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 -; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11FAKE16-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11FAKE16-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_max_f32_e32 v29, v38, v37 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) 
| instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11FAKE16-NEXT: v_dual_max_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; GFX11FAKE16-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11FAKE16-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX11FAKE16-NEXT: v_max_f32_e32 v28, v48, v39 ; GFX11FAKE16-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33 -; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1 ; GFX11FAKE16-NEXT: v_bfe_u32 v35, v14, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v14 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11FAKE16-NEXT: v_bfe_u32 v16, v33, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11FAKE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13 -; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11FAKE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29 -; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1 ; 
GFX11FAKE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27 ; GFX11FAKE16-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11FAKE16-NEXT: v_bfe_u32 v69, v26, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v26 -; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11FAKE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25 ; GFX11FAKE16-NEXT: v_bfe_u32 v83, v8, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo -; 
GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11FAKE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11FAKE16-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v22 -; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11FAKE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 ; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v21 ; GFX11FAKE16-NEXT: v_bfe_u32 v115, v4, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v4 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11FAKE16-NEXT: v_bfe_u32 v117, v20, 16, 1 ; GFX11FAKE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v20 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; 
GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11FAKE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19 ; GFX11FAKE16-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11FAKE16-NEXT: v_or_b32_e32 v134, 0x400000, v18 -; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11FAKE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v148, 0x400000, v0 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11FAKE16-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff ; GFX11FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v2 -; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff ; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff ; GFX11FAKE16-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 ; GFX11FAKE16-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 ; GFX11FAKE16-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 
v9, v71, v80, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -29162,19 +29236,20 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v147, v148 :: v_dual_and_b32 v15, 0xffff0000, v15 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: 
v_lshlrev_b32_e32 v17, 16, v32 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_max_f32_e32 v17, v31, v17 ; GFX11FAKE16-NEXT: v_max_f32_e32 v15, v15, v18 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_bfe_u32 v18, v17, 16, 1 @@ -41565,83 +41640,83 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GCN-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 ; 
GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v6, v18, v20, 16 ; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v8, v21, v22, 16 ; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v10, v23, v24, 16 ; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16 +; GCN-NEXT: v_alignbit_b32 v12, v25, v26, 16 ; 
GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16 +; GCN-NEXT: v_alignbit_b32 v14, v27, v28, 16 ; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: 
v_lshrrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16 @@ -41656,72 +41731,72 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v21 ; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v18, v20, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v23 ; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v18, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v25 ; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v18, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27 ; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 ; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16 +; GFX7-NEXT: v_alignbit_b32 v12, v12, v18, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v18, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, 
vcc ; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -41730,6 +41805,8 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 @@ -41740,11 +41817,9 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; GFX7-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -41838,101 +41913,101 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: 
v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v6, v1, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v5, v1, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v3, v1, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v7, 16 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20 ; GCN-NEXT: 
v_mul_f32_e32 v10, 1.0, v19 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v27 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_alignbit_b32 v13, v13, v15, 16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_alignbit_b32 v15, v15, v21, 16 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: 
s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: s_waitcnt vmcnt(4) ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; GCN-NEXT: s_waitcnt vmcnt(5) ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 -; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: s_waitcnt vmcnt(4) ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: s_waitcnt vmcnt(4) ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_alignbit_b32 v18, v18, v20, 16 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(1) ; 
GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v19, v20, v19, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -42028,37 +42103,37 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc -; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v15, vcc ; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc ; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc ; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc ; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc ; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc ; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc -; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: 
v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 +; GCN-NEXT: v_cndmask_b32_e32 v15, v22, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v2, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v4, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; GCN-NEXT: v_cndmask_b32_e32 v12, v18, v6, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; GCN-NEXT: v_cndmask_b32_e32 v13, v19, v5, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GCN-NEXT: v_cndmask_b32_e32 v14, v20, v3, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; GCN-NEXT: v_cndmask_b32_e32 v16, v21, v1, vcc +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24 @@ -42083,243 +42158,248 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; 
GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_alignbit_b32 v31, v1, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v3, 16 
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v9 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v5, 16 +; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v28, off, 
s[0:3], s32 offset:116 -; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 +; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(13) -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: s_waitcnt vmcnt(11) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_mul_f32_e32 
v7, 1.0, v7 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v4, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:44 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX7-NEXT: v_alignbit_b32 v10, v9, v10, 16 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v12, v11, v12, 16 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_alignbit_b32 v11, v11, v13, 16 +; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_cndmask_b32_e32 v12, v12, v9, vcc +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; 
GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_cndmask_b32_e32 v13, v13, v11, vcc +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v15, v14, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v14, v6, v31, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; 
GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v16, v8, v3, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v22 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; GFX7-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v18, v10, v1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; 
GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; GFX7-NEXT: v_cndmask_b32_e32 v21, v21, v20, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 +; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; GFX7-NEXT: v_cndmask_b32_e32 v23, v23, v22, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; 
GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 +; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; GFX7-NEXT: v_cndmask_b32_e32 v25, v25, v24, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 +; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 +; GFX7-NEXT: v_cndmask_b32_e32 v27, v27, v26, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 
offset:4 -; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_cndmask_b32_e32 v29, v29, v28, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_alignbit_b32 v30, v30, v32, 16 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 ; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_cndmask_b32_e32 v32, v32, v30, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v32bf16: @@ -42328,8 +42408,8 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc ; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc @@ -42355,8 +42435,8 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX900-NEXT: v_cndmask_b32_e32 
v0, v17, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc ; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc ; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc ; GFX900-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc @@ -43137,8 +43217,8 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GCN-LABEL: v_vselect_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 ; GCN-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-NEXT: v_and_b32_e32 v7, 1, v7 ; GCN-NEXT: v_and_b32_e32 v5, 1, v5 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3 @@ -43149,8 +43229,6 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -43161,6 +43239,8 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 @@ -43191,10 +43271,10 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; 
GFX7-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22 @@ -43507,106 +43587,105 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v8 ; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v9 -; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v1, 1, v10 -; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v2, 1, v11 -; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v3, 1, v12 -; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v7, 1, v13 -; GCN-NEXT: v_and_b32_e32 v8, 1, v14 -; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_and_b32_e32 v9, 1, v15 -; GCN-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_and_b32_e32 v1, 1, v9 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v1 +; GCN-NEXT: v_and_b32_e32 v2, 1, v10 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v2 +; GCN-NEXT: v_and_b32_e32 v3, 1, v11 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3 +; GCN-NEXT: v_and_b32_e32 v4, 1, v12 +; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v4 +; GCN-NEXT: v_and_b32_e32 v4, 1, v13 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v4 +; GCN-NEXT: v_and_b32_e32 v4, 1, v14 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v4 +; GCN-NEXT: v_and_b32_e32 v4, 1, v15 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v4 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[42:43] -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_cndmask_b32_e64 v15, v5, v4, s[42:43] +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[40:41] -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_cndmask_b32_e64 v14, v5, v4, s[40:41] +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v13, v7, v9, s[28:29] -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_cndmask_b32_e64 v13, v5, v4, s[28:29] +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v12, v8, 
v9, s[26:27] -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_cndmask_b32_e64 v12, v5, v4, s[26:27] +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v11, v7, v9, s[24:25] -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25] +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v10, v8, v9, s[22:23] -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_cndmask_b32_e64 v10, v5, v4, s[22:23] +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[20:21] -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_cndmask_b32_e64 v9, v6, v5, s[20:21] +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[18:19] -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_cndmask_b32_e64 v8, v6, v5, s[18:19] +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23 ; GCN-NEXT: 
s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[16:17] -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[16:17] +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[14:15] -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_cndmask_b32_e64 v23, v5, v4, s[14:15] +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13] -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 +; GCN-NEXT: 
v_cndmask_b32_e64 v19, v20, v19, s[12:13] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v18, s[10:11] +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -43626,10 +43705,14 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_and_b32_e32 v0, 1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v5 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 @@ -43645,199 +43728,195 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v11 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:64 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v12 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v3, 1, v13 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v12 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v13 ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v14 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v15 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v4 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v15 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[42:43] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v30 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[40:41] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 
v2, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[28:29] +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v12, v2, v1, s[26:27] -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v1, s[24:25] -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v1, v0, s[40:41] +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GFX7-NEXT: v_cndmask_b32_e64 v13, v1, v0, s[28:29] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v1, v0, s[26:27] +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v27 ; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v9, v4, v5, s[20:21] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: 
v_cndmask_b32_e64 v8, v3, v5, s[18:19] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[16:17] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v6, v1, s[24:25] +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v26 +; GFX7-NEXT: v_cndmask_b32_e64 v10, v7, v6, s[22:23] +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v25 +; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v9, v7, v6, s[20:21] +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v24 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v7, v6, s[18:19] +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 ; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[14:15] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[16:17] +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 
0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_cndmask_b32_e64 v6, v5, v6, s[14:15] +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v16 +; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[6:7] ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v19, v1, v19, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, v17, s[4:5] -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[8:9] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11] -; GFX7-NEXT: 
v_cndmask_b32_e32 v0, v4, v16, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v22 +; GFX7-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[12:13] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_vselect_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v4 +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; GFX8-NEXT: v_and_b32_e32 v31, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v31 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v22 +; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v13 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v31, v14, s[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v12 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v30, v22, s[10:11] +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v11 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v22, v14, s[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v10 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v10, v29, v21, s[10:11] +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v28 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v9 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v21, v14, s[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v8 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v28, v20, s[10:11] +; GFX8-NEXT: 
v_lshrrev_b32_e32 v14, 16, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v27 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v7 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v20, v14, s[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v6 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v27, v19, s[10:11] +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX8-NEXT: v_cmp_eq_u32_e64 
s[42:43], 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21] -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27] -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7] -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v3, v25, v17, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v9, v0, v23, s[40:41] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v1, s[42:43] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 
16, v25 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v16, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v19, v4, v23, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v4, v15, s[8:9] +; GFX8-NEXT: v_and_b32_e32 v4, 1, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v15, v5, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v26, v18, s[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; GFX8-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v19, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_vselect_v16bf16: @@ -43846,78 +43925,78 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX900-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX900-NEXT: v_and_b32_e32 v6, 1, v8 +; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 ; GFX900-NEXT: v_and_b32_e32 v6, 1, v10 ; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6 ; GFX900-NEXT: v_and_b32_e32 v6, 1, v12 ; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6 -; GFX900-NEXT: v_and_b32_e32 v8, 1, v13 -; GFX900-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9] -; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8 -; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; GFX900-NEXT: v_and_b32_e32 v6, 1, v13 ; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v22 ; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v30 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v6 ; GFX900-NEXT: v_and_b32_e32 v11, 1, v11 ; GFX900-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; GFX900-NEXT: v_cndmask_b32_e64 v13, v30, v22, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v12, v10, 
s[10:11] +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v20 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v11 +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; GFX900-NEXT: v_cndmask_b32_e64 v21, v29, v21, s[6:7] +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v9 ; GFX900-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX900-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e32 v12, v27, v19, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v27 ; GFX900-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX900-NEXT: v_and_b32_e32 v13, 1, v14 -; GFX900-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9] -; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v21 -; GFX900-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7] -; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v29 -; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11 -; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GFX900-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5] -; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 -; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GFX900-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX900-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7] -; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v28 ; GFX900-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v7, v19, v29, vcc ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX900-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] -; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; GFX900-NEXT: v_and_b32_e32 v14, 1, v14 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX900-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX900-NEXT: 
v_cndmask_b32_e32 v5, v27, v21, vcc -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v22, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5] +; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; GFX900-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX900-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v17 ; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v6, v10, v6, s4 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v9, v8, v23, vcc ; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v28, vcc ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v17 -; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v25 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v15, v14, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v16 ; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v24 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v15, v14, vcc ; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 ; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4 -; GFX900-NEXT: v_perm_b32 v3, v9, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v7, v12, s4 ; 
GFX900-NEXT: v_perm_b32 v4, v11, v20, s4 -; GFX900-NEXT: v_perm_b32 v5, v12, v14, s4 -; GFX900-NEXT: v_perm_b32 v7, v7, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v6, v21, s4 +; GFX900-NEXT: v_perm_b32 v6, v10, v13, s4 +; GFX900-NEXT: v_perm_b32 v7, v8, v9, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_vselect_v16bf16: @@ -44010,79 +44089,79 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v12, 1, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v30 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v22, v33, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX10-NEXT: v_and_b32_e32 v2, 
1, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v25 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v35, v34, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v30, v22, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; 
GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v51, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v50, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v30, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v53, v52, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v49, v48, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v38, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 ; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 ; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v12, v13, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v16bf16: @@ -44561,17 +44640,17 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x 
bfloat> %a, <32 x ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; GFX7-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v25 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25 ; GFX7-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v30 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v30 ; GFX7-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v29 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v29 ; GFX7-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v28 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v28 ; GFX7-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v27 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v27 ; GFX7-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v26 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v26 ; GFX7-NEXT: v_and_b32_e32 v23, 1, v23 ; GFX7-NEXT: v_and_b32_e32 v22, 1, v22 ; GFX7-NEXT: v_and_b32_e32 v21, 1, v21 @@ -44600,7 +44679,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v24 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 @@ -44610,7 +44689,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[14:15] ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; GFX7-NEXT: 
v_and_b32_e32 v30, 0xffff0000, v30 @@ -44618,7 +44697,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[12:13] ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 @@ -44626,7 +44705,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[10:11] ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 @@ -44634,7 +44713,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[8:9] ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 @@ -44642,7 +44721,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[6:7] ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 
offset:232 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 @@ -44650,12 +44729,12 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[4:5] ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[16:17] ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -44980,28 +45059,28 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 +; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 +; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:112 +; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 -; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 +; GFX8-NEXT: buffer_load_dword v25, 
off, s[0:3], s32 offset:124 ; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:60 -; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 -; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[38:39] -; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v26, s[36:37] -; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e64 v26, v33, v26, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v28, s[30:31] +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v20, v33, v20, s[38:39] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v21, v24, s[36:37] +; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v28 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v28, s[30:31] ; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v30 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[90:91] @@ -45010,17 +45089,17 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX8-NEXT: v_cndmask_b32_e64 v30, v33, v30, s[78:79] ; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[76:77] -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[74:75] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 
16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[62:63] -; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[72:73] +; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[62:63] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[60:61] +; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v19 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[58:59] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[58:59] ; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 @@ -45075,21 +45154,21 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v27 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v32 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v20 ; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v10, v22, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v11, v26, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v12, v31, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readlane_b32 s39, v34, 7 ; GFX8-NEXT: v_readlane_b32 s38, v34, 6 ; GFX8-NEXT: v_readlane_b32 s37, v34, 5 @@ -45182,118 +45261,118 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX900-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GFX900-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:92 -; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 -; GFX900-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 -; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 -; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 +; GFX900-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:12 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 +; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:16 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 +; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:20 +; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 +; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24 +; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 +; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 +; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32 +; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 +; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:36 +; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 +; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 +; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 ; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX900-NEXT: buffer_load_dword v23, 
off, s[0:3], s32 offset:48 -; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 -; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 +; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 +; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 +; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 +; GFX900-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 +; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 +; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 +; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 +; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35] -; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] -; GFX900-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95] -; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93] -; GFX900-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91] -; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] -; GFX900-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] -; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; 
GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] -; GFX900-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] -; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] -; GFX900-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] -; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] -; GFX900-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] -; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] -; GFX900-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] -; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX900-NEXT: v_cndmask_b32_e64 v15, v14, v16, s[34:35] ; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] -; GFX900-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] -; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] -; GFX900-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v16, v14, v16, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v13, v17, s[94:95] +; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX900-NEXT: v_cndmask_b32_e64 v17, v13, v17, s[92:93] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v12, v18, s[90:91] +; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] -; GFX900-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v18, v12, v18, s[88:89] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v11, v19, s[78:79] +; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GFX900-NEXT: v_lshrrev_b32_e32 
v11, 16, v11 +; GFX900-NEXT: v_cndmask_b32_e64 v19, v11, v19, s[76:77] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v10, v20, s[74:75] +; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v20, v10, v20, s[72:73] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v9, v21, s[62:63] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[60:61] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v8, v22, s[58:59] +; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] -; GFX900-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v22, v8, v22, s[56:57] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v7, v23, s[46:47] +; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX900-NEXT: v_cndmask_b32_e64 v23, v7, v23, s[44:45] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v6, v24, s[42:43] +; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v24, v6, v24, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v5, v25, s[28:29] +; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_cndmask_b32_e64 v25, v5, v25, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v4, v26, s[24:25] +; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v26, v4, v26, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v3, v27, 
s[20:21] +; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v27, v3, v27, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v2, v28, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v28, v2, v28, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v29, v30, s[12:13] +; GFX900-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v30, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[8:9] +; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v32, v0, v1, s[4:5] ; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4 -; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4 -; GFX900-NEXT: v_perm_b32 v3, v6, v9, s4 -; GFX900-NEXT: v_perm_b32 v4, v8, v11, s4 -; GFX900-NEXT: v_perm_b32 v5, v10, v13, s4 -; GFX900-NEXT: v_perm_b32 v6, v12, v15, s4 -; GFX900-NEXT: v_perm_b32 v7, v14, v17, s4 -; GFX900-NEXT: v_perm_b32 v8, v16, v19, s4 -; GFX900-NEXT: v_perm_b32 v9, v18, v21, s4 -; GFX900-NEXT: v_perm_b32 v10, v20, v23, s4 -; GFX900-NEXT: v_perm_b32 v11, v22, v25, s4 -; GFX900-NEXT: v_perm_b32 v12, v24, v27, s4 -; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4 -; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4 -; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v32, s4 +; GFX900-NEXT: v_perm_b32 v1, v31, v30, s4 +; GFX900-NEXT: v_perm_b32 v2, v29, v2, s4 +; GFX900-NEXT: v_perm_b32 v3, v28, v3, s4 +; 
GFX900-NEXT: v_perm_b32 v4, v27, v4, s4 +; GFX900-NEXT: v_perm_b32 v5, v26, v5, s4 +; GFX900-NEXT: v_perm_b32 v6, v25, v6, s4 +; GFX900-NEXT: v_perm_b32 v7, v24, v7, s4 +; GFX900-NEXT: v_perm_b32 v8, v23, v8, s4 +; GFX900-NEXT: v_perm_b32 v9, v22, v9, s4 +; GFX900-NEXT: v_perm_b32 v10, v21, v10, s4 +; GFX900-NEXT: v_perm_b32 v11, v20, v11, s4 +; GFX900-NEXT: v_perm_b32 v12, v19, v12, s4 +; GFX900-NEXT: v_perm_b32 v13, v18, v13, s4 +; GFX900-NEXT: v_perm_b32 v14, v17, v14, s4 +; GFX900-NEXT: v_perm_b32 v15, v16, v15, s4 ; GFX900-NEXT: v_readlane_b32 s35, v33, 3 ; GFX900-NEXT: v_readlane_b32 s34, v33, 2 ; GFX900-NEXT: v_readlane_b32 s31, v33, 1 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index ab2ad19d0f1bf..c7c76fa8d24fe 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s27, s[8:9], 0x2 +; CHECK-NEXT: s_load_dword s28, s[8:9], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] ; CHECK-NEXT: s_add_u32 s96, s96, s15 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_eq_u32 s27, 0 +; CHECK-NEXT: s_cmp_eq_u32 s28, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 53b2542cf9a7e..7c086d97006f8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2484,10 +2484,10 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx4 
v[0:3], off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_lshrrev_b64 v[18:19], 24, v[0:1] +; SDAG-NEXT: v_lshrrev_b64 v[17:18], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v0 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 @@ -2498,10 +2498,10 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v3 ; SDAG-NEXT: v_mov_b32_e32 v4, v1 ; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v16 +; SDAG-NEXT: v_mov_b32_e32 v2, v12 ; SDAG-NEXT: v_mov_b32_e32 v12, v3 -; SDAG-NEXT: v_mov_b32_e32 v1, v17 -; SDAG-NEXT: v_mov_b32_e32 v2, v16 -; SDAG-NEXT: v_mov_b32_e32 v3, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v17 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: load_v16i8: @@ -2697,19 +2697,19 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: store_v32i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11 -; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 ; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v15 +; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 +; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11 ; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5 ; SDAG-NEXT: v_lshlrev_b16_e32 v7, 8, v7 ; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2719,25 +2719,25 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_lshlrev_b16_e32 v11, 8, v29 -; SDAG-NEXT: v_lshlrev_b16_e32 v14, 8, v25 -; SDAG-NEXT: v_lshlrev_b16_e32 v15, 8, v27 +; SDAG-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; SDAG-NEXT: v_lshlrev_b16_e32 v25, 8, v27 ; SDAG-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; SDAG-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; SDAG-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; SDAG-NEXT: v_lshlrev_b16_e32 v19, 8, v19 ; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; SDAG-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v11, v24, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v14, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v10, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; SDAG-NEXT: v_or_b32_sdwa v11, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v20, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; SDAG-NEXT: v_or_b32_sdwa v5, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v4, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v3, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v14 ; SDAG-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: buffer_store_dwordx4 v[3:6], off, s[16:19], 0 offset:16 @@ -2747,65 +2747,65 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) { ; GISEL-LABEL: store_v32i8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v31, 8 -; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_mov_b32_e32 v32, 0xff -; GISEL-NEXT: v_and_or_b32 v0, v0, v32, v1 -; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GISEL-NEXT: buffer_load_ubyte v7, off, 
s[0:3], s32 -; GISEL-NEXT: v_and_or_b32 v1, v4, v32, v1 +; GISEL-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 8 +; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_mov_b32_e32 v33, 0xff ; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GISEL-NEXT: v_and_or_b32 v0, v0, v33, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GISEL-NEXT: v_or3_b32 v0, v0, v2, v3 -; GISEL-NEXT: v_or3_b32 v1, v1, v4, v5 -; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7 +; GISEL-NEXT: v_and_or_b32 v1, v4, v33, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GISEL-NEXT: v_lshlrev_b32_sdwa v4, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3 +; GISEL-NEXT: v_and_or_b32 v2, v8, v33, v4 ; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10 ; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11 -; GISEL-NEXT: v_and_or_b32 v2, v8, v32, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_sdwa v3, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v14 -; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v15 -; GISEL-NEXT: v_and_or_b32 v3, v12, v32, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GISEL-NEXT: v_or3_b32 v3, v3, v4, v5 -; GISEL-NEXT: 
v_lshlrev_b32_sdwa v4, v31, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v18 -; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v19 -; GISEL-NEXT: v_and_or_b32 v4, v16, v32, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_or3_b32 v4, v4, v5, v6 -; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v22 -; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v23 -; GISEL-NEXT: v_and_or_b32 v8, v20, v32, v8 +; GISEL-NEXT: v_lshlrev_b32_sdwa v3, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_lshlrev_b32_sdwa v4, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v14 +; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GISEL-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v19 +; GISEL-NEXT: v_and_or_b32 v3, v12, v33, v3 +; GISEL-NEXT: v_and_or_b32 v4, v16, v33, v4 ; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GISEL-NEXT: v_or3_b32 v5, v8, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_sdwa v6, v31, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v26 -; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v27 -; GISEL-NEXT: v_and_or_b32 v6, v24, v32, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GISEL-NEXT: v_or3_b32 v4, v4, v7, v8 +; GISEL-NEXT: v_or3_b32 v3, v3, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_sdwa v5, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GISEL-NEXT: v_and_b32_e32 v7, 0xff, v23 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v32, v25 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_and_or_b32 v5, v20, v33, v5 +; GISEL-NEXT: v_or3_b32 v5, v5, v6, v7 +; GISEL-NEXT: v_and_or_b32 v6, v24, v33, v8 +; GISEL-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v27 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GISEL-NEXT: v_or3_b32 v6, v6, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_sdwa v7, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v30 +; GISEL-NEXT: v_and_or_b32 v7, v28, v33, v7 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GISEL-NEXT: v_or3_b32 v6, v6, v8, v9 -; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v30 -; GISEL-NEXT: v_and_or_b32 v8, v28, v32, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GISEL-NEXT: v_or3_b32 v7, v8, v9, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 24, v31 +; GISEL-NEXT: v_or3_b32 v7, v7, v8, v9 ; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 ; GISEL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index b8dd377377dab..bb5ff45359086 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -4268,13 +4268,13 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[8:9] -; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; VI-NEXT: s_getpc_b64 s[4:5] +; 
VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; ; CI-LABEL: test_call_external_void_func_v32i32: @@ -4300,13 +4300,13 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[8:9] -; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: test_call_external_void_func_v32i32: @@ -4332,13 +4332,13 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: 
s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v32i32: @@ -4386,13 +4386,13 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_getpc_b64 s[12:13] -; HSA-NEXT: s_add_u32 s12, s12, external_void_func_v32i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s13, s13, external_void_func_v32i32@rel32@hi+12 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_waitcnt vmcnt(7) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13] +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) poison %val = load <32 x i32>, ptr addrspace(1) %ptr @@ -4403,26 +4403,26 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; VI-LABEL: test_call_external_void_func_v32i32_i32: ; VI: ; %bb.0: +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 
offset:96 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] @@ -4430,34 +4430,34 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; ; CI-LABEL: test_call_external_void_func_v32i32_i32: ; CI: ; %bb.0: +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], 0 +; 
CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] @@ -4465,34 +4465,34 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 -; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; CI-NEXT: s_swappc_b64 s[30:31], 
s[4:5] ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: test_call_external_void_func_v32i32_i32: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ 
-4500,9 +4500,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -4545,8 +4545,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: s_mov_b32 s11, 0x1100f000 ; HSA-NEXT: s_mov_b32 s10, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 @@ -4561,9 +4561,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32_i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_waitcnt vmcnt(8) -; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; HSA-NEXT: s_waitcnt vmcnt(8) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HSA-NEXT: s_waitcnt vmcnt(8) +; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) poison @@ -5398,24 +5398,25 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; VI-NEXT: s_mov_b32 s54, -1 ; VI-NEXT: s_mov_b32 s55, 0xe80000 ; VI-NEXT: s_add_u32 s52, s52, s5 -; VI-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 -; VI-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0xa4 ; VI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_addc_u32 s53, s53, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s23 +; VI-NEXT: v_mov_b32_e32 v0, s19 ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s21 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[52:53] ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; VI-NEXT: s_mov_b64 s[2:3], s[54:55] +; VI-NEXT: s_getpc_b64 s[20:21] +; VI-NEXT: s_add_u32 s20, s20, stack_passed_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s21, s21, stack_passed_f64_arg@rel32@hi+12 ; VI-NEXT: v_mov_b32_e32 v0, s36 ; VI-NEXT: v_mov_b32_e32 v1, s37 ; VI-NEXT: v_mov_b32_e32 v2, s38 @@ -5432,22 +5433,21 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; VI-NEXT: v_mov_b32_e32 v13, s49 ; VI-NEXT: v_mov_b32_e32 v14, s50 ; VI-NEXT: v_mov_b32_e32 v15, s51 -; VI-NEXT: v_mov_b32_e32 v16, s8 -; VI-NEXT: v_mov_b32_e32 v17, s9 -; VI-NEXT: v_mov_b32_e32 v18, s10 -; VI-NEXT: v_mov_b32_e32 v19, s11 -; VI-NEXT: v_mov_b32_e32 v20, s12 -; VI-NEXT: v_mov_b32_e32 v21, s13 -; VI-NEXT: v_mov_b32_e32 v22, s14 -; VI-NEXT: v_mov_b32_e32 v23, s15 -; VI-NEXT: v_mov_b32_e32 v24, s16 -; VI-NEXT: v_mov_b32_e32 v25, s17 -; VI-NEXT: v_mov_b32_e32 v26, s18 -; VI-NEXT: v_mov_b32_e32 v27, s19 -; VI-NEXT: v_mov_b32_e32 v28, s20 -; VI-NEXT: v_mov_b32_e32 v29, s21 -; VI-NEXT: v_mov_b32_e32 v30, s22 -; 
VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_mov_b32_e32 v16, s4 +; VI-NEXT: v_mov_b32_e32 v17, s5 +; VI-NEXT: v_mov_b32_e32 v20, s8 +; VI-NEXT: s_mov_b64 s[2:3], s[54:55] +; VI-NEXT: v_mov_b32_e32 v21, s9 +; VI-NEXT: v_mov_b32_e32 v22, s10 +; VI-NEXT: v_mov_b32_e32 v23, s11 +; VI-NEXT: v_mov_b32_e32 v24, s12 +; VI-NEXT: v_mov_b32_e32 v25, s13 +; VI-NEXT: v_mov_b32_e32 v26, s14 +; VI-NEXT: v_mov_b32_e32 v27, s15 +; VI-NEXT: v_mov_b32_e32 v28, s16 +; VI-NEXT: v_mov_b32_e32 v29, s17 +; VI-NEXT: v_mov_b32_e32 v30, s18 +; VI-NEXT: s_swappc_b64 s[30:31], s[20:21] ; VI-NEXT: s_endpgm ; ; CI-LABEL: stack_passed_arg_alignment_v32i32_f64: @@ -5457,24 +5457,25 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: s_mov_b32 s54, -1 ; CI-NEXT: s_mov_b32 s55, 0xe8f000 ; CI-NEXT: s_add_u32 s52, s52, s5 -; CI-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19 -; CI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x29 +; CI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; CI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x29 ; CI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_addc_u32 s53, s53, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s23 +; CI-NEXT: v_mov_b32_e32 v0, s19 ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s20 ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 -; CI-NEXT: v_mov_b32_e32 v0, s5 +; CI-NEXT: v_mov_b32_e32 v0, s21 +; CI-NEXT: v_mov_b32_e32 v18, s6 +; CI-NEXT: v_mov_b32_e32 v19, s7 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[52:53] ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; CI-NEXT: s_mov_b64 s[2:3], s[54:55] +; CI-NEXT: s_getpc_b64 s[20:21] +; CI-NEXT: s_add_u32 s20, s20, 
stack_passed_f64_arg@rel32@lo+4 +; CI-NEXT: s_addc_u32 s21, s21, stack_passed_f64_arg@rel32@hi+12 ; CI-NEXT: v_mov_b32_e32 v0, s36 ; CI-NEXT: v_mov_b32_e32 v1, s37 ; CI-NEXT: v_mov_b32_e32 v2, s38 @@ -5491,22 +5492,21 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: v_mov_b32_e32 v13, s49 ; CI-NEXT: v_mov_b32_e32 v14, s50 ; CI-NEXT: v_mov_b32_e32 v15, s51 -; CI-NEXT: v_mov_b32_e32 v16, s8 -; CI-NEXT: v_mov_b32_e32 v17, s9 -; CI-NEXT: v_mov_b32_e32 v18, s10 -; CI-NEXT: v_mov_b32_e32 v19, s11 -; CI-NEXT: v_mov_b32_e32 v20, s12 -; CI-NEXT: v_mov_b32_e32 v21, s13 -; CI-NEXT: v_mov_b32_e32 v22, s14 -; CI-NEXT: v_mov_b32_e32 v23, s15 -; CI-NEXT: v_mov_b32_e32 v24, s16 -; CI-NEXT: v_mov_b32_e32 v25, s17 -; CI-NEXT: v_mov_b32_e32 v26, s18 -; CI-NEXT: v_mov_b32_e32 v27, s19 -; CI-NEXT: v_mov_b32_e32 v28, s20 -; CI-NEXT: v_mov_b32_e32 v29, s21 -; CI-NEXT: v_mov_b32_e32 v30, s22 -; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: v_mov_b32_e32 v16, s4 +; CI-NEXT: v_mov_b32_e32 v17, s5 +; CI-NEXT: v_mov_b32_e32 v20, s8 +; CI-NEXT: s_mov_b64 s[2:3], s[54:55] +; CI-NEXT: v_mov_b32_e32 v21, s9 +; CI-NEXT: v_mov_b32_e32 v22, s10 +; CI-NEXT: v_mov_b32_e32 v23, s11 +; CI-NEXT: v_mov_b32_e32 v24, s12 +; CI-NEXT: v_mov_b32_e32 v25, s13 +; CI-NEXT: v_mov_b32_e32 v26, s14 +; CI-NEXT: v_mov_b32_e32 v27, s15 +; CI-NEXT: v_mov_b32_e32 v28, s16 +; CI-NEXT: v_mov_b32_e32 v29, s17 +; CI-NEXT: v_mov_b32_e32 v30, s18 +; CI-NEXT: s_swappc_b64 s[30:31], s[20:21] ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: @@ -5516,24 +5516,25 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX9-NEXT: s_mov_b32 s54, -1 ; GFX9-NEXT: s_mov_b32 s55, 0xe00000 ; GFX9-NEXT: s_add_u32 s52, s52, s5 -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0xa4 ; 
GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_addc_u32 s53, s53, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s23 +; GFX9-NEXT: v_mov_b32_e32 v0, s19 ; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 ; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s6 +; GFX9-NEXT: v_mov_b32_e32 v19, s7 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-NEXT: s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, stack_passed_f64_arg@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, s36 ; GFX9-NEXT: v_mov_b32_e32 v1, s37 ; GFX9-NEXT: v_mov_b32_e32 v2, s38 @@ -5550,22 +5551,21 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX9-NEXT: v_mov_b32_e32 v13, s49 ; GFX9-NEXT: v_mov_b32_e32 v14, s50 ; GFX9-NEXT: v_mov_b32_e32 v15, s51 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 -; GFX9-NEXT: v_mov_b32_e32 v24, s16 -; GFX9-NEXT: v_mov_b32_e32 v25, s17 -; GFX9-NEXT: v_mov_b32_e32 v26, s18 -; GFX9-NEXT: v_mov_b32_e32 v27, s19 -; GFX9-NEXT: v_mov_b32_e32 v28, s20 -; GFX9-NEXT: v_mov_b32_e32 v29, s21 -; GFX9-NEXT: v_mov_b32_e32 v30, s22 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: 
v_mov_b32_e32 v16, s4 +; GFX9-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-NEXT: v_mov_b32_e32 v20, s8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-NEXT: v_mov_b32_e32 v21, s9 +; GFX9-NEXT: v_mov_b32_e32 v22, s10 +; GFX9-NEXT: v_mov_b32_e32 v23, s11 +; GFX9-NEXT: v_mov_b32_e32 v24, s12 +; GFX9-NEXT: v_mov_b32_e32 v25, s13 +; GFX9-NEXT: v_mov_b32_e32 v26, s14 +; GFX9-NEXT: v_mov_b32_e32 v27, s15 +; GFX9-NEXT: v_mov_b32_e32 v28, s16 +; GFX9-NEXT: v_mov_b32_e32 v29, s17 +; GFX9-NEXT: v_mov_b32_e32 v30, s18 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64: @@ -5628,7 +5628,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-NEXT: s_getpc_b64 s[24:25] ; HSA-NEXT: s_add_u32 s24, s24, stack_passed_f64_arg@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s25, s25, stack_passed_f64_arg@rel32@hi+12 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, s36 ; HSA-NEXT: v_mov_b32_e32 v1, s37 ; HSA-NEXT: v_mov_b32_e32 v2, s38 @@ -5650,6 +5649,7 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-NEXT: v_mov_b32_e32 v18, s10 ; HSA-NEXT: v_mov_b32_e32 v19, s11 ; HSA-NEXT: v_mov_b32_e32 v20, s12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v21, s13 ; HSA-NEXT: v_mov_b32_e32 v22, s14 ; HSA-NEXT: v_mov_b32_e32 v23, s15 @@ -5672,51 +5672,51 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt 
vmcnt(2) ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; VI-NEXT: s_setpc_b64 s[4:5] ; ; CI-LABEL: tail_call_byval_align16: ; CI: ; %bb.0: ; %entry ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; CI-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: tail_call_byval_align16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 
s5, s5, byval_align16_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX9-NEXT: s_setpc_b64 s[4:5] ; ; GFX11-LABEL: tail_call_byval_align16: @@ -5737,17 +5737,17 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; HSA: ; %bb.0: ; %entry ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; HSA-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 -; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: s_waitcnt vmcnt(2) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 -; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; HSA-NEXT: s_waitcnt vmcnt(2) -; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; HSA-NEXT: s_waitcnt vmcnt(1) -; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; HSA-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca double, align 8, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll index d05424ffe773d..96d52a2ec0aaa 100644 
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -379,8 +379,15 @@ define void @too_many_args_use_workitem_id_x( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: ; GCN: s_mov_b32 s32, 0 -; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} -; GCN: v_mov_b32_e32 v31, v0 + +; GFX90A: v_mov_b32_e32 v1, 0x140 +; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}} +; GFX90A: v_mov_b32_e32 v31, v0 + +; GFX7: v_mov_b32_e32 v31, v0 +; GFX7: v_mov_b32_e32 v0, 0x140 +; GFX7: buffer_store_dword v0, off, s[0:3], s32{{$}} + ; GCN: s_swappc_b64 ; GCN: .amdhsa_system_vgpr_workitem_id 0 @@ -572,12 +579,12 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[ID_Y]], off{{$}} ; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[ID_Z]], off{{$}} -; GFX7: v_and_b32_e32 v32, 0x3ff, v31 -; GFX7: v_bfe_u32 v32, v31, 10, 10 +; GFX7: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} +; GFX7: v_and_b32_e32 v33, 0x3ff, v31 +; GFX7: v_bfe_u32 v33, v31, 10, 10 ; GCN7: v_bfe_u32 v31, v31, 20, 10 -; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}} +; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33{{$}} ; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31{{$}} -; GFX7: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} ; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[LOAD_ARG31]] ; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[LOAD_ARG31]] diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index b671d68a4b75b..7bac9b780b5c8 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -332,8 +332,8 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; FIXEDABI: 
v_and_b32_e32 v31, 0x3ff, v31 ; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} +; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 define void @too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -464,10 +464,11 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x( ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: +; FIXEDABI: buffer_load_dword v32, off, s[0:3], s32{{$}} + ; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 ; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 -; FIXEDABI: buffer_load_dword v31, off, s[0:3], s32{{$}} ; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} ; FIXEDABI: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index ddd3b1520bf5e..795df3e317043 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -3401,12 +3401,12 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_or_b32_e32 v1, v31, v1 ; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_and_b32_e32 v17, 3, v28 ; SI-NEXT: v_and_b32_e32 v18, 3, v24 -; SI-NEXT: v_and_b32_e32 v19, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 3, v20 ; SI-NEXT: v_and_b32_e32 v16, 3, v16 +; SI-NEXT: v_or_b32_e32 v9, v23, v9 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_and_b32_e32 v12, 3, v12 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 @@ -3417,7 +3417,7 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; SI-NEXT: v_and_b32_e32 v0, 3, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v3, v18, v5 -; 
SI-NEXT: v_or_b32_e32 v5, v19, v9 +; SI-NEXT: v_or_b32_e32 v5, v20, v9 ; SI-NEXT: v_or_b32_e32 v7, v16, v13 ; SI-NEXT: v_or_b32_e32 v9, v12, v14 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 @@ -3450,57 +3450,70 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; ; VI-LABEL: amdgpu_cs_v32i1: ; VI: ; %bb.0: +; VI-NEXT: v_and_b32_e32 v14, 1, v14 +; VI-NEXT: v_lshlrev_b16_e32 v13, 1, v13 +; VI-NEXT: v_and_b32_e32 v12, 1, v12 +; VI-NEXT: v_and_b32_e32 v10, 1, v10 +; VI-NEXT: v_lshlrev_b16_e32 v9, 1, v9 +; VI-NEXT: v_and_b32_e32 v8, 1, v8 ; VI-NEXT: v_and_b32_e32 v6, 1, v6 ; VI-NEXT: v_lshlrev_b16_e32 v5, 1, v5 ; VI-NEXT: v_and_b32_e32 v4, 1, v4 ; VI-NEXT: v_and_b32_e32 v2, 1, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: v_lshlrev_b16_e32 v15, 3, v15 +; VI-NEXT: v_lshlrev_b16_e32 v14, 2, v14 +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_lshlrev_b16_e32 v11, 3, v11 +; VI-NEXT: v_lshlrev_b16_e32 v10, 2, v10 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v7, 3, v7 ; VI-NEXT: v_lshlrev_b16_e32 v6, 2, v6 ; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_and_b32_e32 v12, 3, v12 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_and_b32_e32 v8, 3, v8 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 ; VI-NEXT: v_and_b32_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_and_b32_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v12, v12, v14 +; VI-NEXT: v_or_b32_e32 v8, v8, v10 ; VI-NEXT: v_or_b32_e32 v4, v4, v6 +; VI-NEXT: v_mov_b32_e32 v3, 15 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 4, v4 +; VI-NEXT: v_lshlrev_b16_e32 v1, 12, v12 +; VI-NEXT: v_and_b32_sdwa v5, v8, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v4, 4, v4 ; VI-NEXT: v_and_b32_e32 v0, 15, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 
+; VI-NEXT: v_or_b32_e32 v0, v0, v4 ; VI-NEXT: v_and_b32_e32 v2, 1, v30 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 3, v31 ; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_lshlrev_b16_e32 v2, 1, v29 -; VI-NEXT: v_and_b32_e32 v3, 1, v28 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_and_b32_e32 v4, 1, v28 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: v_and_b32_e32 v2, 3, v2 -; VI-NEXT: v_and_b32_e32 v3, 1, v26 +; VI-NEXT: v_and_b32_e32 v4, 1, v26 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_lshlrev_b16_e32 v2, 3, v27 -; VI-NEXT: v_lshlrev_b16_e32 v3, 2, v3 -; VI-NEXT: v_and_b32_e32 v10, 1, v10 -; VI-NEXT: v_lshlrev_b16_e32 v9, 1, v9 -; VI-NEXT: v_and_b32_e32 v8, 1, v8 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_lshlrev_b16_e32 v3, 1, v25 -; VI-NEXT: v_and_b32_e32 v4, 1, v24 -; VI-NEXT: v_lshlrev_b16_e32 v11, 3, v11 -; VI-NEXT: v_lshlrev_b16_e32 v10, 2, v10 -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_and_b32_e32 v8, 3, v8 -; VI-NEXT: v_and_b32_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_e32 v8, v8, v10 -; VI-NEXT: v_mov_b32_e32 v10, 15 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_lshlrev_b16_e32 v4, 2, v4 +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_lshlrev_b16_e32 v4, 1, v25 +; VI-NEXT: v_and_b32_e32 v5, 1, v24 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_and_b32_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 12, v1 -; VI-NEXT: v_and_b32_sdwa v2, v2, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_and_b32_e32 v3, 1, v22 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_lshlrev_b16_e32 v2, 3, v23 @@ -3514,28 +3527,15 @@ define 
amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_lshlrev_b16_e32 v3, 3, v19 ; VI-NEXT: v_lshlrev_b16_e32 v4, 2, v4 -; VI-NEXT: v_and_b32_e32 v14, 1, v14 -; VI-NEXT: v_lshlrev_b16_e32 v13, 1, v13 -; VI-NEXT: v_and_b32_e32 v12, 1, v12 ; VI-NEXT: v_or_b32_e32 v3, v3, v4 ; VI-NEXT: v_lshlrev_b16_e32 v4, 1, v17 ; VI-NEXT: v_and_b32_e32 v5, 1, v16 -; VI-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; VI-NEXT: v_lshlrev_b16_e32 v14, 2, v14 -; VI-NEXT: v_or_b32_e32 v12, v12, v13 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_and_b32_e32 v12, 3, v12 ; VI-NEXT: v_and_b32_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_e32 v12, v12, v14 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_lshlrev_b16_e32 v9, 12, v12 -; VI-NEXT: v_and_b32_sdwa v8, v8, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 4, v2 ; VI-NEXT: v_and_b32_e32 v3, 15, v3 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[0:1], v0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index cefcbddd3e394..14eee1e362500 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -721,7 +721,6 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -730,6 +729,7 @@ define 
amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 @@ -789,12 +789,12 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0 ; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 ; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 49ba0e2ac796a..0ae5e73adb44f 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -69,20 +69,20 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v18, v16 +; GFX9-NEXT: v_or_b32_e32 v15, v3, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v18, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v19, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_xor_b32_e32 v6, 0x7f, v2 -; GFX9-NEXT: v_or_b32_e32 v7, v3, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 
+; GFX9-NEXT: v_or_b32_e32 v14, v6, v4 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v11, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[4:5] @@ -152,38 +152,38 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GFX9-NEXT: v_or_b32_e32 v4, v14, v30 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v4, v14, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] -; GFX9-NEXT: v_or_b32_e32 v5, v15, v31 -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v3 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v15 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v14 ; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v21 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v20 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc -; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 -; GFX9-NEXT: v_and_b32_e32 v6, v30, v0 +; GFX9-NEXT: v_and_b32_e32 v14, v30, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v14, vcc ; GFX9-NEXT: v_and_b32_e32 v14, v30, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v6, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v14, vcc ; GFX9-NEXT: 
v_add_co_u32_e32 v22, vcc, -1, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc +; GFX9-NEXT: v_or_b32_e32 v5, v15, v5 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v14, v22, v24 ; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 ; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 @@ -1154,13 +1154,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16 ; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc ; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc ; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5 ; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17 @@ -1172,8 +1172,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 ; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v10 -; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v11 +; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 ; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 @@ -1185,15 +1185,15 @@ define i128 
@v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5] ; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v8 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10 ; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] -; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v9 +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11 ; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v11 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13 ; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3 -; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13] ; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1 ; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3 ; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7] @@ -1210,7 +1210,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v15, v1, v3 ; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[6:7] ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[6:7] ; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GFX9-G-NEXT: v_or_b32_e32 v20, v7, v6 @@ -1218,58 +1217,59 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 ; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20 ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-G-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-NEXT: v_cndmask_b32_e64 
v14, 0, 1, vcc ; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 ; GFX9-G-NEXT: v_and_b32_e32 v14, 1, v14 ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-G-NEXT: s_xor_b64 s[6:7], vcc, -1 +; GFX9-G-NEXT: s_and_saveexec_b64 s[12:13], s[6:7] ; GFX9-G-NEXT: s_cbranch_execz .LBB0_6 ; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, 1, v0 ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v1, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc -; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, 0x7f, v0 -; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v12 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v12, v[10:11] -; GFX9-G-NEXT: v_add_u32_e32 v13, 0xffffffc0, v12 -; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v12, v[8:9] +; GFX9-G-NEXT: s_xor_b64 s[8:9], vcc, -1 +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] +; GFX9-G-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v13, v[8:9] -; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; GFX9-G-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc -; GFX9-G-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc -; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-G-NEXT: 
v_mov_b32_e32 v1, s9 -; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc +; GFX9-G-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-G-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-G-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-G-NEXT: s_and_saveexec_b64 s[4:5], s[8:9] +; GFX9-G-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[8:9] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] ; GFX9-G-NEXT: v_add_u32_e32 v24, 0xffffffc0, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20 ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -1279,75 +1279,75 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5] -; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, 
s8 ; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc -; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[10:11] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v13 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] -; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v11 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v24, v2 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2 ; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v25, v3, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v26, v0, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v27, v1, vcc -; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v10 -; GFX9-G-NEXT: v_and_b32_e32 v10, v28, v18 -; GFX9-G-NEXT: v_and_b32_e32 v11, v28, v19 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v10 -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v11, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12 +; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12 +; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v19 +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v2, vcc ; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4 -; GFX9-G-NEXT: 
v_and_b32_e32 v3, v28, v5 ; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v3, vcc +; GFX9-G-NEXT: v_and_b32_e32 v0, v28, v5 +; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v0, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 ; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8 -; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28 +; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow ; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: .LBB0_5: ; %Flow2 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v4 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 ; GFX9-G-NEXT: .LBB0_6: ; %Flow3 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v16 ; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-G-NEXT: v_xor_b32_e32 
v2, v12, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-G-NEXT: v_xor_b32_e32 v4, v13, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -2358,43 +2358,43 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v10, v16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v9 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v1, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v24, v2, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v25, v3, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v16 -; GFX9-NEXT: v_and_b32_e32 v16, v26, v4 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v22, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v23, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v24, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 31, v11 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v12 +; GFX9-NEXT: v_or_b32_e32 v10, v16, v10 +; GFX9-NEXT: v_and_b32_e32 v16, v12, v4 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v16 -; GFX9-NEXT: v_and_b32_e32 v16, v26, v5 +; GFX9-NEXT: v_and_b32_e32 v16, v12, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, 
v16, vcc -; GFX9-NEXT: v_and_b32_e32 v16, v26, v6 +; GFX9-NEXT: v_and_b32_e32 v16, v12, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v16, vcc -; GFX9-NEXT: v_and_b32_e32 v16, v26, v7 +; GFX9-NEXT: v_and_b32_e32 v16, v12, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-NEXT: v_or_b32_e32 v11, v17, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX9-NEXT: v_or_b32_e32 v17, v19, v21 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v26 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or3_b32 v8, v8, v26, v14 +; GFX9-NEXT: v_mov_b32_e32 v16, v12 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v16, v12 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB1_3 ; GFX9-NEXT: ; %bb.4: ; %Flow @@ -3365,41 +3365,41 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[14:15] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v26, 31, v15 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v27, 31, v9 ; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12 ; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13 ; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[16:17] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 -; GFX9-G-NEXT: 
v_or_b32_e32 v2, v2, v12 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v22, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v23, v3, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v24, v10, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v11, vcc -; GFX9-G-NEXT: v_ashrrev_i32_e32 v12, 31, v12 -; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v4 -; GFX9-G-NEXT: v_and_b32_e32 v16, v12, v5 -; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v13 -; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc -; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v6 -; GFX9-G-NEXT: v_and_b32_e32 v17, v12, v7 -; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v10, v13, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v11, v17, vcc -; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 -; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc -; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc -; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc -; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20 -; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v27 +; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v22, v2 +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v23, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v24, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v25, v11, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v10, 31, v10 +; GFX9-G-NEXT: v_and_b32_e32 v12, v10, v4 +; GFX9-G-NEXT: v_add_co_u32_e64 v18, s[4:5], -1, v18 +; GFX9-G-NEXT: v_and_b32_e32 v13, v10, v5 +; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v12 +; GFX9-G-NEXT: v_addc_co_u32_e64 v19, s[4:5], -1, v19, s[4:5] +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v13, vcc +; GFX9-G-NEXT: v_and_b32_e32 v12, v10, v6 +; GFX9-G-NEXT: v_addc_co_u32_e64 v20, s[4:5], -1, v20, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v13, v10, v7 +; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v0, v12, vcc +; GFX9-G-NEXT: v_addc_co_u32_e64 v21, s[4:5], -1, v21, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v12, v18, v20 +; GFX9-G-NEXT: 
v_subb_co_u32_e32 v17, vcc, v11, v13, vcc +; GFX9-G-NEXT: v_or_b32_e32 v13, v19, v21 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] -; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 -; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12 +; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v10 ; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v26 +; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 7ea98a16e3b84..e71d4a89f7bfe 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -7,56 +7,56 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v26, v24 ; SDAG-NEXT: v_mov_b32_e32 v27, v25 -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v1, v20 -; SDAG-NEXT: v_ffbh_u32_e32 v2, v21 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v0, v20, v16 -; SDAG-NEXT: 
v_sub_i32_e32 v3, vcc, 0, v8 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v16 -; SDAG-NEXT: v_or_b32_e32 v1, v21, v17 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v18, v2, v18, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v2, v16 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v17 +; SDAG-NEXT: v_or_b32_e32 v0, v16, v18 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v3, v1, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v3, v18 +; SDAG-NEXT: v_or_b32_e32 v1, v17, v19 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc -; SDAG-NEXT: v_min_u32_e32 v2, v19, v2 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v17 +; SDAG-NEXT: v_min_u32_e32 v2, v2, v21 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 32, v3 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v19 +; SDAG-NEXT: v_subb_u32_e32 v30, vcc, 0, v10, vcc ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7] -; SDAG-NEXT: v_min_u32_e32 v1, v19, v22 +; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v22, s[6:7] +; SDAG-NEXT: v_min_u32_e32 v1, v3, v21 ; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2 ; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9] ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v30, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v3, v29 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v28 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, 
s[6:7] +; SDAG-NEXT: v_ffbh_u32_e32 v21, v28 ; SDAG-NEXT: v_or_b32_e32 v2, v29, v0 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7] ; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3 ; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 ; SDAG-NEXT: v_or_b32_e32 v3, v28, v1 -; SDAG-NEXT: v_min_u32_e32 v8, v8, v19 +; SDAG-NEXT: v_min_u32_e32 v8, v8, v21 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v2, v11, v19 +; SDAG-NEXT: v_min_u32_e32 v2, v11, v21 ; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8 ; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] @@ -65,361 +65,361 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v2 +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v20, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v8, v10 -; SDAG-NEXT: v_or_b32_e32 v9, v3, v11 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v20, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_or_b32_e32 v11, v3, v9 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: 
v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v21, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v19, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v18, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v17, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, 0, s[4:5] ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2 -; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v2 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v2 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc -; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18 -; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v30, v32 ; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v2 -; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], v34 +; SDAG-NEXT: v_or_b32_e32 v9, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], v34 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v34 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_lshr_b64 v[10:11], v[20:21], v35 -; SDAG-NEXT: v_or_b32_e32 v3, v3, v11 
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v34 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v35 +; SDAG-NEXT: v_or_b32_e32 v3, v3, v9 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v8 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v21, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v20, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v30 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10 -; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 -; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 -; SDAG-NEXT: v_subrev_i32_e64 v8, s[4:5], 64, v30 -; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 -; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, v20, s[4:5] -; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30 -; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc -; SDAG-NEXT: 
v_cndmask_b32_e32 v22, 0, v8, vcc +; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v30 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[18:19], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[18:19], v35 +; SDAG-NEXT: v_lshr_b64 v[18:19], v[18:19], v36 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v11, v11, v49 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v37, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SDAG-NEXT: v_cndmask_b32_e32 v17, v11, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v10, v16, vcc ; SDAG-NEXT: v_mov_b32_e32 v11, 0 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v3 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: 
v_or_b32_e32 v19, v17, v19 -; SDAG-NEXT: v_or_b32_e32 v18, v16, v18 -; SDAG-NEXT: v_or_b32_e32 v16, v22, v38 -; SDAG-NEXT: v_or_b32_e32 v17, v20, v39 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v8 -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v17 -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v21, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v16, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; SDAG-NEXT: v_and_b32_e32 v20, v8, v29 -; SDAG-NEXT: v_and_b32_e32 v22, v8, v28 -; SDAG-NEXT: v_and_b32_e32 v38, v8, v0 -; SDAG-NEXT: v_and_b32_e32 v39, v8, v1 -; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20 -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc -; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v38 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v39 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v16 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v17, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v19, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v10, 31, v10 +; SDAG-NEXT: v_and_b32_e32 v22, v10, v29 +; SDAG-NEXT: v_and_b32_e32 v23, v10, v28 +; SDAG-NEXT: v_and_b32_e32 v38, v10, v0 +; SDAG-NEXT: v_and_b32_e32 v39, v10, v1 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v22 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v17, v23, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v18, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v19, v39, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v30, v32 -; SDAG-NEXT: v_or_b32_e32 v17, v31, v33 -; SDAG-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_or_b32_e32 v3, v11, v3 -; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v2, v10, v2 -; SDAG-NEXT: v_mov_b32_e32 v17, v9 -; SDAG-NEXT: v_mov_b32_e32 v16, v8 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v22, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v23, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; SDAG-NEXT: v_or_b32_e32 v3, v9, v3 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_mov_b32_e32 v23, v11 +; SDAG-NEXT: v_mov_b32_e32 v22, v10 +; SDAG-NEXT: v_or_b32_e32 v2, v8, v2 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB0_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v16 -; SDAG-NEXT: v_or_b32_e32 v18, v11, v1 -; SDAG-NEXT: v_or_b32_e32 v19, v9, v3 -; SDAG-NEXT: v_or_b32_e32 v22, v10, v0 -; SDAG-NEXT: v_or_b32_e32 v23, v8, v2 +; SDAG-NEXT: v_or_b32_e32 v20, v9, v1 +; SDAG-NEXT: v_or_b32_e32 v21, v11, v3 +; SDAG-NEXT: v_or_b32_e32 v22, v8, v0 +; SDAG-NEXT: v_or_b32_e32 v23, v10, v2 ; SDAG-NEXT: .LBB0_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7 ; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v20, v16 -; SDAG-NEXT: v_mov_b32_e32 v21, v17 +; SDAG-NEXT: v_mov_b32_e32 v18, v16 +; SDAG-NEXT: v_mov_b32_e32 v19, v17 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, 
v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v4, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v0, s[4:5] ; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12 -; SDAG-NEXT: v_or_b32_e32 v0, v2, v6 -; SDAG-NEXT: v_ffbh_u32_e32 v9, v6 -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v2, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v8 +; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 32, v1 ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v3, v7 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v30, v7 -; SDAG-NEXT: v_min_u32_e32 v4, v10, v4 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v3, v9 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], 32, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v30, v9 +; SDAG-NEXT: v_min_u32_e32 v4, v7, v4 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, 0, v14, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] ; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_min_u32_e32 v1, v9, v30 +; SDAG-NEXT: v_min_u32_e32 v1, v6, v30 ; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4 ; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v10, v29 +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 0, v15, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v7, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v7, v29 ; SDAG-NEXT: v_ffbh_u32_e32 v11, v28 -; SDAG-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v6, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v4, v29, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v9, v0 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v0 +; SDAG-NEXT: v_add_i32_e32 v7, vcc, 32, v7 ; SDAG-NEXT: v_or_b32_e32 v5, v28, v1 -; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9 +; SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v6 ; SDAG-NEXT: v_ffbh_u32_e32 v14, v1 -; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 +; SDAG-NEXT: v_min_u32_e32 v7, v7, v11 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_min_u32_e32 v4, v9, v14 -; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v4, v6, v14 +; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v7 +; SDAG-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v8, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v6, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v11, 0x7f, v4 +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v9, v10 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, 0, v10, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v11, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; SDAG-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v5, v11 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_or_b32_e32 v11, v5, v7 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v12 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v12 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v8, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4 ; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc ; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 -; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4 -; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34 -; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 -; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35 -; SDAG-NEXT: v_or_b32_e32 v5, v5, v11 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 -; 
SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v6, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v7, vcc +; SDAG-NEXT: v_or_b32_e32 v5, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0x7f, v4 +; SDAG-NEXT: v_or_b32_e32 v6, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[8:9], v7 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, 64, v7 +; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v7 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[5:6] +; SDAG-NEXT: v_lshr_b64 v[4:5], v[2:3], v4 +; SDAG-NEXT: v_or_b32_e32 v5, v15, v5 +; SDAG-NEXT: v_or_b32_e32 v4, v14, v4 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v6, v13, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v35, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v34, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v7, v6, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v8, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30 +; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[6:7], v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[8:9], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: v_mov_b32_e32 v14, 0 ; 
SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35 -; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[8:9], v35 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[8:9], v36 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v9, v49 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v48 +; SDAG-NEXT: v_or_b32_e32 v11, v11, v49 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v9, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v8, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v37, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: .LBB0_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v7 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v5 ; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v8 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; SDAG-NEXT: 
v_or_b32_e32 v8, v8, v10 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v39 -; SDAG-NEXT: v_or_b32_e32 v5, v13, v5 -; SDAG-NEXT: v_or_b32_e32 v11, v15, v11 -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2 -; SDAG-NEXT: v_or_b32_e32 v4, v12, v4 -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; SDAG-NEXT: v_and_b32_e32 v15, v8, v29 -; SDAG-NEXT: v_and_b32_e32 v38, v8, v28 -; SDAG-NEXT: v_and_b32_e32 v39, v8, v0 -; SDAG-NEXT: v_and_b32_e32 v48, v8, v1 +; SDAG-NEXT: v_or_b32_e32 v5, v15, v5 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v39 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v2 +; SDAG-NEXT: v_or_b32_e32 v6, v12, v6 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v8, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v9, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v10, 31, v10 +; SDAG-NEXT: v_and_b32_e32 v15, v10, v29 +; SDAG-NEXT: v_and_b32_e32 v38, v10, v0 +; SDAG-NEXT: v_and_b32_e32 v39, v10, v28 +; SDAG-NEXT: v_and_b32_e32 v48, v10, v1 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v39, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v8, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v48, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, 
s[10:11] -; SDAG-NEXT: v_or_b32_e32 v10, v14, v10 -; SDAG-NEXT: v_mov_b32_e32 v15, v9 -; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_or_b32_e32 v7, v13, v7 +; SDAG-NEXT: v_or_b32_e32 v4, v14, v4 +; SDAG-NEXT: v_mov_b32_e32 v15, v11 +; SDAG-NEXT: v_mov_b32_e32 v14, v10 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB0_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[4:5], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v6 ; SDAG-NEXT: v_or_b32_e32 v13, v13, v1 -; SDAG-NEXT: v_or_b32_e32 v14, v9, v3 -; SDAG-NEXT: v_or_b32_e32 v9, v12, v0 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 +; SDAG-NEXT: v_or_b32_e32 v14, v11, v3 +; SDAG-NEXT: v_or_b32_e32 v11, v12, v0 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 ; SDAG-NEXT: .LBB0_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 ; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 -; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20 +; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18 ; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16 -; SDAG-NEXT: v_xor_b32_e32 v4, v18, v3 +; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3 ; SDAG-NEXT: v_xor_b32_e32 v5, v22, v2 -; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3 +; SDAG-NEXT: v_xor_b32_e32 v1, v21, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2 -; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7 -; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6 +; SDAG-NEXT: v_xor_b32_e32 v8, v13, v7 +; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6 ; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc ; SDAG-NEXT: 
v_subb_u32_e32 v3, vcc, v4, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v4, v8, v6 +; SDAG-NEXT: v_xor_b32_e32 v4, v10, v6 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: v_sdiv_v2i128_vv: @@ -427,43 +427,43 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v21, 0 ; GISEL-NEXT: s_mov_b64 s[8:9], 0 -; GISEL-NEXT: v_xor_b32_e32 v0, v24, v0 -; GISEL-NEXT: v_xor_b32_e32 v1, v24, v1 -; GISEL-NEXT: v_xor_b32_e32 v2, v24, v2 -; GISEL-NEXT: v_xor_b32_e32 v3, v24, v3 ; GISEL-NEXT: v_xor_b32_e32 v8, v25, v8 ; GISEL-NEXT: v_xor_b32_e32 v9, v25, v9 ; GISEL-NEXT: v_xor_b32_e32 v10, v25, v10 ; GISEL-NEXT: v_xor_b32_e32 v11, v25, v11 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v24 -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v24, vcc -; GISEL-NEXT: v_sub_i32_e64 v26, s[4:5], v8, v25 -; GISEL-NEXT: v_subb_u32_e64 v27, s[4:5], v9, v25, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v24, vcc -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v24, vcc -; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v25, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v0, v24, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v24, v1 +; GISEL-NEXT: v_xor_b32_e32 v2, v24, v2 +; GISEL-NEXT: v_xor_b32_e32 v3, v24, v3 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v8, v25 +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v9, v25, vcc +; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v24 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v1, v24, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v25, vcc ; GISEL-NEXT: 
v_subb_u32_e32 v11, vcc, v11, v25, vcc ; GISEL-NEXT: v_ffbh_u32_e32 v8, v27 +; GISEL-NEXT: v_subb_u32_e64 v18, vcc, v2, v24, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v3, v24, vcc ; GISEL-NEXT: v_ffbh_u32_e32 v9, v26 -; GISEL-NEXT: v_ffbh_u32_e32 v22, v18 -; GISEL-NEXT: v_ffbh_u32_e32 v23, v19 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 ; GISEL-NEXT: v_or_b32_e32 v0, v26, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v27, v11 -; GISEL-NEXT: v_or_b32_e32 v2, v18, v20 -; GISEL-NEXT: v_or_b32_e32 v3, v19, v21 +; GISEL-NEXT: v_or_b32_e32 v2, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v3, v17, v19 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v9 -; GISEL-NEXT: v_add_i32_e32 v22, vcc, 32, v22 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 ; GISEL-NEXT: v_ffbh_u32_e32 v28, v10 ; GISEL-NEXT: v_ffbh_u32_e32 v29, v11 -; GISEL-NEXT: v_ffbh_u32_e32 v30, v20 -; GISEL-NEXT: v_ffbh_u32_e32 v31, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v30, v18 +; GISEL-NEXT: v_ffbh_u32_e32 v31, v19 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] ; GISEL-NEXT: v_min_u32_e32 v0, v8, v9 -; GISEL-NEXT: v_min_u32_e32 v1, v23, v22 +; GISEL-NEXT: v_min_u32_e32 v1, v22, v23 ; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v28 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v30 ; GISEL-NEXT: v_min_u32_e32 v2, v29, v2 @@ -474,35 +474,35 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v2 -; GISEL-NEXT: 
v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 ; GISEL-NEXT: v_or_b32_e32 v9, v3, v1 ; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v9, v22, v16 +; GISEL-NEXT: v_or_b32_e32 v9, v22, v20 ; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_and_b32_e32 v9, 1, v9 ; GISEL-NEXT: v_and_b32_e32 v8, 1, v8 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc ; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v21, v17, 0, vcc +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v2 @@ -511,111 +511,111 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_not_b32_e32 v2, 63 ; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v32, v2 +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v32, v2 ; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32 -; GISEL-NEXT: v_lshl_b64 
v[0:1], v[18:19], v32 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], v32 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v32 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[8:9], v[18:19], v8 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v16 +; GISEL-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32 -; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v8, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v9, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v20, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v28 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, 0xffffffc0, v28 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 -; GISEL-NEXT: v_lshr_b64 v[0:1], v[20:21], v28 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[18:19], v28 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], v22 -; GISEL-NEXT: v_or_b32_e32 v22, v2, v22 -; GISEL-NEXT: v_or_b32_e32 v23, v3, v23 -; 
GISEL-NEXT: s_mov_b64 s[8:9], 0 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[20:21], v32 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v22, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v23, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, v2, v18, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v19, v3, v19, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_lshr_b64 v[0:1], v[18:19], v28 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_add_i32_e32 v32, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v27, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v22 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[18:19], v34 ; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v10, vcc ; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v11, vcc -; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] -; GISEL-NEXT: v_mov_b32_e32 v23, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, s8 -; GISEL-NEXT: v_mov_b32_e32 v1, s9 -; GISEL-NEXT: v_mov_b32_e32 v2, s10 -; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v23 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 +; GISEL-NEXT: v_cndmask_b32_e32 v22, v2, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, v3, v17, vcc +; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshrrev_b32_e32 v36, 31, v17 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 -; GISEL-NEXT: v_or_b32_e32 v16, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v17, v1, v3 -; 
GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v19 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v36, 31, v21 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 31, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v32, v0 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v33, v1, vcc -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v34, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v35, v3, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v22, 31, v18 -; GISEL-NEXT: v_and_b32_e32 v18, v22, v26 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v18 -; GISEL-NEXT: v_and_b32_e32 v0, v22, v27 -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v0, vcc -; GISEL-NEXT: v_and_b32_e32 v0, v22, v10 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v0, vcc -; GISEL-NEXT: v_and_b32_e32 v0, v22, v11 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v0, vcc +; GISEL-NEXT: v_lshrrev_b32_e32 v16, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v9 ; GISEL-NEXT: v_add_i32_e32 v28, vcc, -1, v28 ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v16 +; GISEL-NEXT: v_or_b32_e32 v22, v22, v20 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 ; GISEL-NEXT: v_addc_u32_e32 v30, vcc, -1, v30, vcc ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v32, v22 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v23, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v28, v30 ; GISEL-NEXT: v_or_b32_e32 v1, v29, v31 +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v34, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v35, v19, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v22, 1, v22 -; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; 
GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v2 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, v0, v26 +; GISEL-NEXT: v_and_b32_e32 v2, v0, v27 +; GISEL-NEXT: v_and_b32_e32 v3, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v16, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v37, v0, v11 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v22, v1 +; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v23, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v18, v3, vcc +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v37, vcc ; GISEL-NEXT: v_or_b32_e32 v8, v8, v36 -; GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GISEL-NEXT: v_mov_b32_e32 v1, v23 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB0_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB0_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v17 +; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21 ; GISEL-NEXT: v_or_b32_e32 v8, v8, v10 -; GISEL-NEXT: v_or_b32_e32 v22, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v23, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 ; GISEL-NEXT: .LBB0_6: ; %Flow16 -; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 @@ -631,18 +631,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc -; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 
v4, v19 -; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19 +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc ; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc ; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v14, v21 -; GISEL-NEXT: v_ffbh_u32_e32 v15, v20 +; GISEL-NEXT: v_ffbh_u32_e32 v14, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v22 ; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 ; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 -; GISEL-NEXT: v_or_b32_e32 v0, v20, v4 -; GISEL-NEXT: v_or_b32_e32 v1, v21, v5 +; GISEL-NEXT: v_or_b32_e32 v0, v22, v4 +; GISEL-NEXT: v_or_b32_e32 v1, v23, v5 ; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 ; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 @@ -733,8 +733,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20 -; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc ; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 ; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc @@ -783,8 +783,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v12, v0, v20 -; GISEL-NEXT: v_and_b32_e32 v13, v0, v21 +; GISEL-NEXT: v_and_b32_e32 v12, v0, v22 +; GISEL-NEXT: v_and_b32_e32 v13, v0, v23 ; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 ; GISEL-NEXT: v_mov_b32_e32 v0, v6 @@ -809,8 +809,8 @@ define <2 x 
i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24 ; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18 -; GISEL-NEXT: v_xor_b32_e32 v0, v22, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v23, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3 +; GISEL-NEXT: v_xor_b32_e32 v1, v21, v3 ; GISEL-NEXT: v_xor_b32_e32 v2, v8, v3 ; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 ; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 @@ -835,57 +835,57 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 -; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 -; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 -; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v10 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v11 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v8 +; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 ; SDAG-NEXT: v_ffbh_u32_e32 v23, v9 ; SDAG-NEXT: v_ffbh_u32_e32 v24, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v25, v3 ; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 +; SDAG-NEXT: s_mov_b64 s[6:7], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], 32, v22 +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v24 +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v26 ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 -; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 -; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 -; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 ; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 ; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 -; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 -; SDAG-NEXT: v_min_u32_e32 v19, v19, 
v27 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v18, v20, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v22, v27 +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] ; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 ; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 ; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v22, v20, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v22, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v20 ; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[6:7], v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 -; SDAG-NEXT: v_or_b32_e32 v17, v23, v25 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v21, v25 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[24:25] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5] ; SDAG-NEXT: 
s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] @@ -895,24 +895,24 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v22 -; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v22 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v23, vcc +; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v20 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v20 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v21, vcc ; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v24, vcc ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v25, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v26, v28 -; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22 +; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v20 ; SDAG-NEXT: v_or_b32_e32 v19, v27, v29 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v30 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v30 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v31 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 +; SDAG-NEXT: v_or_b32_e32 v19, v21, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v18 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 ; SDAG-NEXT: v_cndmask_b32_e64 v19, v17, v19, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v16, v18, s[4:5] @@ -921,8 +921,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 ; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: 
v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_5 @@ -947,12 +947,12 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: v_mov_b32_e32 v24, 0 ; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v17 @@ -960,23 +960,23 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 ; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v19 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v20 -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v30, v0 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v31, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v32, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v33, v3, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v20, 31, v20 -; SDAG-NEXT: v_and_b32_e32 v24, v20, v8 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v22 +; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v19 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v22 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v30, v0 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v31, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v32, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v33, v3, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v22, 31, v22 +; SDAG-NEXT: v_and_b32_e32 v24, 
v22, v8 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v24 -; SDAG-NEXT: v_and_b32_e32 v24, v20, v9 +; SDAG-NEXT: v_and_b32_e32 v24, v22, v9 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v24, vcc -; SDAG-NEXT: v_and_b32_e32 v24, v20, v10 +; SDAG-NEXT: v_and_b32_e32 v24, v22, v10 ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v24, vcc -; SDAG-NEXT: v_and_b32_e32 v24, v20, v11 +; SDAG-NEXT: v_and_b32_e32 v24, v22, v11 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v24, vcc ; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc @@ -985,14 +985,14 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v24, v26, v28 ; SDAG-NEXT: v_or_b32_e32 v25, v27, v29 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] -; SDAG-NEXT: v_and_b32_e32 v20, 1, v20 +; SDAG-NEXT: v_and_b32_e32 v22, 1, v22 ; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_or_b32_e32 v18, v18, v34 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v19, v21, v19 ; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 -; SDAG-NEXT: v_mov_b32_e32 v25, v21 -; SDAG-NEXT: v_mov_b32_e32 v24, v20 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v18 +; SDAG-NEXT: v_mov_b32_e32 v25, v23 +; SDAG-NEXT: v_mov_b32_e32 v24, v22 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SDAG-NEXT: s_cbranch_execnz .LBB1_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 @@ -1003,10 +1003,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; SDAG-NEXT: v_or_b32_e32 v16, v23, v1 -; SDAG-NEXT: v_or_b32_e32 v18, v21, v3 -; SDAG-NEXT: v_or_b32_e32 v17, v22, v0 -; SDAG-NEXT: v_or_b32_e32 v19, v20, v2 +; SDAG-NEXT: v_or_b32_e32 v16, v21, v1 +; SDAG-NEXT: v_or_b32_e32 v18, v23, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v20, v0 +; SDAG-NEXT: v_or_b32_e32 v19, v22, v2 ; SDAG-NEXT: .LBB1_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 
exec, exec, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 @@ -1046,20 +1046,20 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc -; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v24, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0 +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v24, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v3, v1, v21 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_and_b32_e32 v2, 1, v8 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v1, v3 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v10 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 @@ -1070,118 +1070,118 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0 -; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0 -; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; 
SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc -; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 -; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc -; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v22, v24 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 -; SDAG-NEXT: v_or_b32_e32 v11, v23, v25 -; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], v26 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27 -; SDAG-NEXT: v_or_b32_e32 v1, v1, v11 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v10 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; SDAG-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 +; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v8, v24 +; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0x7f, v0 +; SDAG-NEXT: v_or_b32_e32 v2, v11, v25 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v3 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v3 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v3 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[1:2] +; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0 +; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 
0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[2:3], v[4:5], v22 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v22 -; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v22 -; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v22 +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8 +; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 +; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v8 ; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 ; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v27 ; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v28 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v3, v3, v32 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v32 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v31 ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v7, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v6, v20, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v30, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v29, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; 
SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: .LBB1_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v5 ; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v1 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v20 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v30 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 +; SDAG-NEXT: v_or_b32_e32 v2, v9, v2 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v26, v4 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v27, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v28, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v29, v7, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v20 +; SDAG-NEXT: v_and_b32_e32 v31, v30, v15 +; SDAG-NEXT: v_and_b32_e32 v20, v30, v12 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v2 -; SDAG-NEXT: v_or_b32_e32 v2, v4, v30 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v31 -; SDAG-NEXT: v_or_b32_e32 v1, v21, v1 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v26, v2 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v27, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v28, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v29, v7, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v4 -; SDAG-NEXT: v_and_b32_e32 v31, v30, v13 -; SDAG-NEXT: v_and_b32_e32 v4, v30, v12 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v2, v4 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v31, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 -; SDAG-NEXT: v_or_b32_e32 v0, 
v20, v0 -; SDAG-NEXT: v_and_b32_e32 v2, 1, v30 -; SDAG-NEXT: v_and_b32_e32 v11, v30, v15 -; SDAG-NEXT: v_and_b32_e32 v30, v30, v14 -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v30, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc -; SDAG-NEXT: v_add_i32_e32 v22, vcc, -1, v22 -; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v20 +; SDAG-NEXT: v_and_b32_e32 v20, v30, v13 +; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v20, vcc +; SDAG-NEXT: v_and_b32_e32 v20, 1, v30 +; SDAG-NEXT: v_and_b32_e32 v23, v30, v14 +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v23, vcc +; SDAG-NEXT: v_or_b32_e32 v3, v10, v3 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v31, vcc +; SDAG-NEXT: v_add_i32_e32 v8, vcc, -1, v8 +; SDAG-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc ; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc ; SDAG-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc -; SDAG-NEXT: v_or_b32_e32 v31, v23, v25 -; SDAG-NEXT: v_or_b32_e32 v30, v22, v24 +; SDAG-NEXT: v_or_b32_e32 v31, v11, v25 +; SDAG-NEXT: v_or_b32_e32 v30, v8, v24 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v8, v10, v8 -; SDAG-NEXT: v_mov_b32_e32 v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v10, v2 +; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 +; SDAG-NEXT: v_mov_b32_e32 v23, v21 +; SDAG-NEXT: v_mov_b32_e32 v22, v20 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB1_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v6 -; SDAG-NEXT: v_or_b32_e32 v8, v21, v1 -; SDAG-NEXT: v_or_b32_e32 v10, v3, v5 -; SDAG-NEXT: v_or_b32_e32 
v9, v20, v0 -; SDAG-NEXT: v_or_b32_e32 v11, v2, v4 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v4 +; SDAG-NEXT: v_or_b32_e32 v8, v10, v3 +; SDAG-NEXT: v_or_b32_e32 v10, v21, v1 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v2 +; SDAG-NEXT: v_or_b32_e32 v11, v20, v0 ; SDAG-NEXT: .LBB1_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v19 @@ -1199,31 +1199,31 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v16, v2 ; GISEL-NEXT: v_mov_b32_e32 v17, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v8, v10 -; GISEL-NEXT: v_or_b32_e32 v3, v9, v11 -; GISEL-NEXT: v_or_b32_e32 v18, v0, v16 -; GISEL-NEXT: v_or_b32_e32 v19, v1, v17 ; GISEL-NEXT: v_ffbh_u32_e32 v20, v9 ; GISEL-NEXT: v_ffbh_u32_e32 v21, v8 ; GISEL-NEXT: v_ffbh_u32_e32 v22, v11 ; GISEL-NEXT: v_ffbh_u32_e32 v23, v10 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v16 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v16 +; GISEL-NEXT: v_or_b32_e32 v2, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v3, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v18, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v19, v1, v17 ; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v25, 0 ; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 +; GISEL-NEXT: v_add_i32_e32 v27, vcc, 32, v27 +; GISEL-NEXT: v_add_i32_e32 v29, vcc, 32, v29 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v21 -; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v23 -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 -; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v28 -; GISEL-NEXT: v_min_u32_e32 v2, v20, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v22, v3 -; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 -; 
GISEL-NEXT: v_min_u32_e32 v19, v29, v19 +; GISEL-NEXT: v_min_u32_e32 v2, v20, v21 +; GISEL-NEXT: v_min_u32_e32 v3, v22, v23 +; GISEL-NEXT: v_min_u32_e32 v18, v26, v27 +; GISEL-NEXT: v_min_u32_e32 v19, v28, v29 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2 @@ -1234,9 +1234,9 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v2, v3 ; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22 ; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22 ; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v2, v2, v20 @@ -1308,13 +1308,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 ; GISEL-NEXT: v_or_b32_e32 v21, v21, v25 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v20, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v18, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v19, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v24, v20, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v25, v21, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v21, s7 ; GISEL-NEXT: v_mov_b32_e32 v20, s6 @@ -1324,37 +1324,37 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> 
%rhs) { ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v23 ; GISEL-NEXT: v_lshl_b64 v[20:21], v[22:23], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v25 -; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; GISEL-NEXT: v_or_b32_e32 v22, v18, v20 +; GISEL-NEXT: v_or_b32_e32 v23, v19, v21 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v17 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v35, 31, v3 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1 +; GISEL-NEXT: v_or_b32_e32 v0, v18, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v18, 31, v3 +; GISEL-NEXT: v_or_b32_e32 v16, v16, v18 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v30, v16 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v31, v17, vcc +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v32, v0, vcc +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v33, v19, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v18 +; GISEL-NEXT: v_and_b32_e32 v18, v20, v8 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_and_b32_e32 v18, v20, v9 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v18, vcc +; GISEL-NEXT: v_and_b32_e32 v18, v20, v10 +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v0, v18, vcc +; GISEL-NEXT: v_and_b32_e32 v0, v20, v11 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v19, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc -; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GISEL-NEXT: v_or_b32_e32 v22, v18, v20 -; GISEL-NEXT: v_or_b32_e32 v23, v19, v21 -; GISEL-NEXT: v_or_b32_e32 v16, v16, v0 -; GISEL-NEXT: v_or_b32_e32 v20, v24, v35 ; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v20 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v25, vcc ; GISEL-NEXT: v_or_b32_e32 v18, v26, v28 ; GISEL-NEXT: v_or_b32_e32 v19, v27, v29 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v32, v16, vcc -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v17, vcc ; GISEL-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 1, v20 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v18, v0, v8 -; GISEL-NEXT: v_and_b32_e32 v19, v0, v9 -; GISEL-NEXT: v_and_b32_e32 v21, v0, v10 -; GISEL-NEXT: v_and_b32_e32 v35, v0, v11 -; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v20, v18 -; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v19, vcc -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v21, vcc -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc ; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 ; GISEL-NEXT: v_mov_b32_e32 v19, v1 ; GISEL-NEXT: v_mov_b32_e32 v18, v0 @@ -1434,29 +1434,29 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v16 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v17, vcc -; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 -; GISEL-NEXT: v_not_b32_e32 v9, 63 -; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v24, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e64 v17, s[4:5], 0, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v16 +; GISEL-NEXT: v_not_b32_e32 v8, 63 +; GISEL-NEXT: v_addc_u32_e64 v16, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v26, v9 -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26 -; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v22, v8 +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v22 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v22 +; GISEL-NEXT: v_lshl_b64 v[8:9], v[6:7], v22 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v10 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[4:5], v9 
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v20, v16 -; GISEL-NEXT: v_or_b32_e32 v1, v21, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GISEL-NEXT: v_lshr_b64 v[10:11], v[4:5], v10 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[4:5], v20 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v10, v8 +; GISEL-NEXT: v_or_b32_e32 v9, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v20, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v21, v9, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v23, s11 ; GISEL-NEXT: v_mov_b32_e32 v22, s10 @@ -1466,10 +1466,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v8 -; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8 -; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v8 -; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v24 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v24 +; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v24 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v24 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v12 ; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc @@ -1480,12 +1480,12 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: 
s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_or_b32_e32 v20, v20, v22 ; GISEL-NEXT: v_or_b32_e32 v21, v21, v23 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v20, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc ; GISEL-NEXT: v_mov_b32_e32 v5, 0 @@ -1496,50 +1496,50 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: .LBB1_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], 1 -; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v1 -; GISEL-NEXT: v_lshl_b64 v[6:7], v[9:10], 1 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v9, 31, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, -1, v8 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc -; GISEL-NEXT: v_or_b32_e32 v16, v16, v4 +; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v9 +; GISEL-NEXT: v_lshl_b64 v[6:7], v[0:1], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v31, 31, v1 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v4 ; GISEL-NEXT: v_or_b32_e32 v22, v22, v30 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 -; GISEL-NEXT: v_or_b32_e32 v9, v20, v6 -; GISEL-NEXT: v_or_b32_e32 v10, v21, v7 -; GISEL-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v20, v6 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v26, v22 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v27, v23, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v28, v10, vcc +; GISEL-NEXT: 
v_subb_u32_e32 v1, vcc, v29, v11, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v30, 31, v1 +; GISEL-NEXT: v_and_b32_e32 v32, v30, v12 +; GISEL-NEXT: v_add_i32_e32 v24, vcc, -1, v24 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, -1, v17, vcc +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, -1, v16, vcc ; GISEL-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v26, v22 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v27, v23, vcc -; GISEL-NEXT: v_or_b32_e32 v6, v8, v24 -; GISEL-NEXT: v_or_b32_e32 v7, v11, v25 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v28, v16, vcc -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v29, v17, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v4, 1, v6 -; GISEL-NEXT: v_and_b32_e32 v7, v6, v12 -; GISEL-NEXT: v_and_b32_e32 v30, v6, v13 -; GISEL-NEXT: v_and_b32_e32 v31, v6, v14 -; GISEL-NEXT: v_and_b32_e32 v32, v6, v15 +; GISEL-NEXT: v_and_b32_e32 v4, 1, v30 +; GISEL-NEXT: v_or_b32_e32 v6, v24, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v21, v7 ; GISEL-NEXT: v_mov_b32_e32 v21, v5 ; GISEL-NEXT: v_mov_b32_e32 v20, v4 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v7 -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v30, vcc -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v31, vcc -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v32, vcc +; GISEL-NEXT: v_or_b32_e32 v7, v17, v25 +; GISEL-NEXT: v_and_b32_e32 v4, v30, v13 +; GISEL-NEXT: v_and_b32_e32 v33, v30, v14 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v30, v30, v15 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v32 +; GISEL-NEXT: v_or_b32_e32 v8, v8, v31 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v33, vcc +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v30, vcc ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB1_9 ; 
GISEL-NEXT: ; %bb.10: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB1_11: ; %Flow11 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1 -; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v10 +; GISEL-NEXT: v_lshl_b64 v[4:5], v[0:1], 1 +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 ; GISEL-NEXT: v_or_b32_e32 v10, v20, v4 ; GISEL-NEXT: v_or_b32_e32 v11, v21, v5 @@ -1576,48 +1576,48 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v17 ; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v2, v16, v0 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc ; SDAG-NEXT: v_or_b32_e32 v3, v17, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v1 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 ; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v1 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, 0, v10, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v3, v20, v22 -; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v3, s[8:9], 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v8, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_min_u32_e32 v9, v20, v24 +; SDAG-NEXT: v_subb_u32_e32 v18, 
vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v22, s[4:5] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, 0, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v10, v31 +; SDAG-NEXT: v_cndmask_b32_e32 v21, v3, v9, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v22, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v8, v31, v2 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v2 +; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v2 ; SDAG-NEXT: v_or_b32_e32 v9, v30, v3 -; SDAG-NEXT: v_min_u32_e32 v11, v11, v21 -; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 +; SDAG-NEXT: v_min_u32_e32 v10, v10, v22 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v3 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v20, v21 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11 -; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v8, v11, v18 +; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v10, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v21 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] @@ -1666,10 
+1666,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v11, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v10, v0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_5 @@ -1679,8 +1679,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 ; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 @@ -1707,13 +1707,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v25 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 ; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 ; SDAG-NEXT: v_or_b32_e32 v22, v26, v48 ; SDAG-NEXT: v_or_b32_e32 v23, v24, v49 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v8 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23 ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc @@ -1735,24 
+1735,24 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v22, v32, v34 ; SDAG-NEXT: v_or_b32_e32 v23, v33, v35 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] -; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 +; SDAG-NEXT: v_or_b32_e32 v19, v11, v19 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 ; SDAG-NEXT: v_mov_b32_e32 v23, v9 ; SDAG-NEXT: v_mov_b32_e32 v22, v8 +; SDAG-NEXT: v_or_b32_e32 v18, v10, v18 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB2_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v22 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v34, v19, v11 -; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v22 +; SDAG-NEXT: v_or_b32_e32 v34, v11, v19 ; SDAG-NEXT: v_or_b32_e32 v27, v9, v21 +; SDAG-NEXT: v_or_b32_e32 v32, v10, v18 ; SDAG-NEXT: v_or_b32_e32 v33, v8, v20 ; SDAG-NEXT: .LBB2_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -2028,43 +2028,43 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v19, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v20, 0 ; GISEL-NEXT: s_mov_b64 s[8:9], 0 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 ; GISEL-NEXT: v_xor_b32_e32 v8, v8, v18 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v18 ; GISEL-NEXT: v_xor_b32_e32 v10, v10, v18 ; GISEL-NEXT: v_xor_b32_e32 v11, v11, v18 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v28 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v28, vcc -; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v18 -; GISEL-NEXT: 
v_subb_u32_e64 v29, s[4:5], v9, v18, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v28, vcc -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc -; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v18, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_sub_i32_e32 v30, vcc, v8, v18 +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v9, v18, vcc +; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v28 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v1, v28, s[4:5] +; GISEL-NEXT: v_ffbh_u32_e32 v21, v29 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v30 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v18, vcc ; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v18, v29 -; GISEL-NEXT: v_ffbh_u32_e32 v21, v30 -; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 +; GISEL-NEXT: v_subb_u32_e64 v8, vcc, v2, v28, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v18, v17 ; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 ; GISEL-NEXT: v_or_b32_e32 v0, v30, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v29, v11 ; GISEL-NEXT: v_or_b32_e32 v2, v16, v8 ; GISEL-NEXT: v_or_b32_e32 v3, v17, v9 -; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, 32, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v11 ; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v24, v10 -; GISEL-NEXT: v_ffbh_u32_e32 v25, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v8 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v9 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; GISEL-NEXT: v_min_u32_e32 v0, v18, v21 -; GISEL-NEXT: v_min_u32_e32 v1, v22, v23 -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v24 +; GISEL-NEXT: v_min_u32_e32 v0, v21, v22 +; GISEL-NEXT: v_min_u32_e32 v1, v18, v23 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v25 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, 
v26 -; GISEL-NEXT: v_min_u32_e32 v2, v25, v2 +; GISEL-NEXT: v_min_u32_e32 v2, v24, v2 ; GISEL-NEXT: v_min_u32_e32 v3, v27, v3 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1 @@ -2167,9 +2167,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v27 +; GISEL-NEXT: v_lshrrev_b32_e32 v48, 31, v19 ; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 ; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v48, 31, v19 ; GISEL-NEXT: v_add_i32_e32 v31, vcc, -1, v31 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 @@ -2195,11 +2195,11 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 ; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v3, v1 ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v24, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v22 +; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v2, v48, vcc ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc ; GISEL-NEXT: v_or_b32_e32 v18, v18, v39 -; GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB2_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 @@ -2355,42 +2355,42 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB2_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21 -; GISEL-NEXT: v_lshl_b64 v[52:53], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v15 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v21 ; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; GISEL-NEXT: 
v_lshrrev_b32_e32 v24, 31, v25 -; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v15 -; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v25 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 ; GISEL-NEXT: v_add_i32_e32 v36, vcc, -1, v36 ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc -; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 -; GISEL-NEXT: v_or_b32_e32 v3, v52, v25 -; GISEL-NEXT: v_or_b32_e32 v14, v14, v22 +; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GISEL-NEXT: v_or_b32_e32 v26, v26, v25 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v21 ; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v38, vcc ; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v48, v3 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v49, v53, vcc +; GISEL-NEXT: v_or_b32_e32 v14, v14, v24 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v48, v2 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v49, v3, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v36, v38 ; GISEL-NEXT: v_or_b32_e32 v1, v37, v39 -; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v50, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v50, v26, vcc ; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v51, v27, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v1, v0, v35 +; GISEL-NEXT: v_and_b32_e32 v24, v0, v35 ; GISEL-NEXT: v_and_b32_e32 v25, v0, v34 -; GISEL-NEXT: v_and_b32_e32 v26, v0, v4 -; GISEL-NEXT: v_and_b32_e32 v52, v0, v5 -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 -; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v53, v25, vcc +; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v52, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v53, v0, v5 ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 -; 
GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v2, v24 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v3, v25, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v26, v52, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v53, vcc ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB2_9 ; GISEL-NEXT: ; %bb.10: ; %Flow @@ -2465,30 +2465,30 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 -; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 -; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 -; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v10 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v11 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v8 +; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 ; SDAG-NEXT: v_ffbh_u32_e32 v23, v9 ; SDAG-NEXT: v_ffbh_u32_e32 v24, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v25, v3 ; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 +; SDAG-NEXT: s_mov_b64 s[6:7], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], 32, v22 +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v24 +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v26 ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 -; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 -; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 -; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 ; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 ; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 -; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 -; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, 
s[4:5] +; SDAG-NEXT: v_min_u32_e32 v18, v20, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v22, v27 +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] ; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 ; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 @@ -2503,19 +2503,19 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc ; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[6:7], v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 -; SDAG-NEXT: v_or_b32_e32 v17, v19, v21 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v19, v21 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[20:21] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v23, v22, s[4:5] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v22 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5] @@ -2620,9 +2620,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] ; SDAG-NEXT: v_or_b32_e32 v19, v21, v19 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v18, v20, v18 ; SDAG-NEXT: v_mov_b32_e32 v25, v17 ; SDAG-NEXT: v_mov_b32_e32 v24, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v18 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: 
s_cbranch_execnz .LBB3_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 @@ -2676,63 +2676,63 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc +; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16 +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v28, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v20, v20, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_or_b32_e32 v21, v17, v19 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_and_b32_e32 v20, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v34, 
vcc, 1, v16 ; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc ; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22 -; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc -; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc -; SDAG-NEXT: v_or_b32_e32 v20, v34, v36 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 -; SDAG-NEXT: v_or_b32_e32 v21, v35, v37 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v26 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v27 -; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v6, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v18, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v34, v36 +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v35, v37 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v19 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v19 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18] +; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, 
v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34 ; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34 @@ -2745,120 +2745,120 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v28 ; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39 ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v19, v29 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v28 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v29 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v28 ; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v50, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v49, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v50, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v49, v20, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v4, vcc -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v21, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v20, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: .LBB3_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 
-; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v27 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v17 +; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19 +; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v21 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v18, v28, v18 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_or_b32_e32 v28, v28, v20 ; SDAG-NEXT: v_or_b32_e32 v26, v26, v50 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v51 -; SDAG-NEXT: v_or_b32_e32 v17, v23, v17 -; SDAG-NEXT: v_or_b32_e32 v21, v25, v21 -; SDAG-NEXT: v_sub_i32_e32 v25, vcc, v38, v26 -; SDAG-NEXT: v_or_b32_e32 v16, v22, v16 -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v39, v27, vcc -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v48, v18, vcc -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v49, v29, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v25 -; SDAG-NEXT: v_and_b32_e32 v28, v25, v12 -; SDAG-NEXT: v_and_b32_e32 v50, v25, v13 -; SDAG-NEXT: v_and_b32_e32 v51, v25, v14 -; SDAG-NEXT: v_and_b32_e32 v52, v25, v15 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v28 -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v50, vcc -; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v18, v51, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v51 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v38, v26 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v39, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v48, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v49, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v20, 31, v20 +; SDAG-NEXT: v_and_b32_e32 v25, v20, v12 +; SDAG-NEXT: v_and_b32_e32 v50, v20, v14 +; SDAG-NEXT: v_and_b32_e32 v51, v20, v13 +; SDAG-NEXT: v_and_b32_e32 v52, v20, v15 +; SDAG-NEXT: v_and_b32_e32 v20, 1, v20 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v25 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v51, vcc +; 
SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v50, vcc ; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v52, vcc ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc -; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc ; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc ; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] -; SDAG-NEXT: v_and_b32_e32 v18, 1, v25 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 -; SDAG-NEXT: v_mov_b32_e32 v25, v19 -; SDAG-NEXT: v_mov_b32_e32 v24, v18 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_mov_b32_e32 v25, v21 +; SDAG-NEXT: v_mov_b32_e32 v24, v20 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB3_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 -; SDAG-NEXT: v_or_b32_e32 v23, v23, v17 -; SDAG-NEXT: v_or_b32_e32 v19, v19, v21 -; SDAG-NEXT: v_or_b32_e32 v22, v22, v16 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v17 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v16 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 +; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11 ; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 ; SDAG-NEXT: v_mul_lo_u32 v29, v33, 
v8 ; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mul_lo_u32 v34, v18, v15 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v18, v14, 0 -; SDAG-NEXT: v_mul_lo_u32 v35, v19, v14 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v20, v15 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v20, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v35, v21, v14 ; SDAG-NEXT: v_mul_lo_u32 v36, v23, v12 ; SDAG-NEXT: v_mul_lo_u32 v37, v22, v13 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v18, 0 -; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; SDAG-NEXT: v_mov_b32_e32 v20, v11 -; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v20, 0 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; SDAG-NEXT: v_mov_b32_e32 v18, v11 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[18:19] ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 ; SDAG-NEXT: v_add_i32_e64 v23, s[4:5], v25, v34 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28 -; SDAG-NEXT: v_mov_b32_e32 v20, v26 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21] +; SDAG-NEXT: v_mov_b32_e32 v18, v26 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v23, v35 ; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v31, v8, v[16:17] ; SDAG-NEXT: v_add_i32_e64 v26, s[4:5], v27, v11 ; SDAG-NEXT: v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v18, v[20:21] +; SDAG-NEXT: v_mov_b32_e32 v18, v15 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v20, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v29, v17 ; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27] ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v11 -; SDAG-NEXT: 
v_mov_b32_e32 v20, v22 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v19, v[20:21] +; SDAG-NEXT: v_mov_b32_e32 v18, v22 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v21, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v33, v15 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v37, v17 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v37, v17 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v12 ; SDAG-NEXT: v_addc_u32_e64 v18, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v16 ; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v15, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v19, v[17:18] +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v21, v[17:18] ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v19, vcc ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc @@ -2868,31 +2868,31 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_urem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_or_b32_e32 v16, v8, v10 -; GISEL-NEXT: v_or_b32_e32 v17, v9, v11 -; GISEL-NEXT: v_or_b32_e32 v18, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v19, v1, v3 ; GISEL-NEXT: v_ffbh_u32_e32 v22, v9 ; GISEL-NEXT: v_ffbh_u32_e32 v23, v8 ; GISEL-NEXT: v_ffbh_u32_e32 v24, v11 ; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v2 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v2 +; GISEL-NEXT: v_or_b32_e32 v16, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v17, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v18, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v19, v1, 
v3 ; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v21, 0 ; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 +; GISEL-NEXT: v_add_i32_e32 v25, vcc, 32, v25 +; GISEL-NEXT: v_add_i32_e32 v27, vcc, 32, v27 +; GISEL-NEXT: v_add_i32_e32 v29, vcc, 32, v29 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23 -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v25 -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 -; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v28 -; GISEL-NEXT: v_min_u32_e32 v16, v22, v16 -; GISEL-NEXT: v_min_u32_e32 v17, v24, v17 -; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 -; GISEL-NEXT: v_min_u32_e32 v19, v29, v19 +; GISEL-NEXT: v_min_u32_e32 v16, v22, v23 +; GISEL-NEXT: v_min_u32_e32 v17, v24, v25 +; GISEL-NEXT: v_min_u32_e32 v18, v26, v27 +; GISEL-NEXT: v_min_u32_e32 v19, v28, v29 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16 @@ -2903,9 +2903,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc ; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v17 ; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_xor_b32_e32 v23, 0x7f, v18 ; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v23, 0x7f, v18 ; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v20, v23, v16 @@ -2994,9 +2994,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshrrev_b32_e32 v38, 31, v23 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v29 +; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 ; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 ; 
GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 @@ -3022,11 +3022,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v24, 1, v16 ; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 ; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v26, vcc +; GISEL-NEXT: v_mov_b32_e32 v16, v24 +; GISEL-NEXT: v_mov_b32_e32 v17, v25 ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v18, v39, vcc ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc ; GISEL-NEXT: v_or_b32_e32 v20, v20, v38 -; GISEL-NEXT: v_mov_b32_e32 v16, v24 -; GISEL-NEXT: v_mov_b32_e32 v17, v25 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB3_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 @@ -3164,42 +3164,42 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v16, s4 ; GISEL-NEXT: .LBB3_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v25 -; GISEL-NEXT: v_lshl_b64 v[50:51], v[28:29], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[28:29], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v23 +; GISEL-NEXT: v_lshrrev_b32_e32 v28, 31, v25 ; GISEL-NEXT: v_lshl_b64 v[30:31], v[30:31], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v28, 31, v29 -; GISEL-NEXT: v_lshrrev_b32_e32 v29, 31, v23 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v29, 31, v29 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 ; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v34 ; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc -; GISEL-NEXT: v_or_b32_e32 v24, v16, v18 -; GISEL-NEXT: v_or_b32_e32 v25, v17, v19 -; GISEL-NEXT: v_or_b32_e32 v18, v30, v28 -; GISEL-NEXT: v_or_b32_e32 v19, v50, v29 -; GISEL-NEXT: v_or_b32_e32 v22, v22, v26 +; GISEL-NEXT: 
v_lshl_b64 v[22:23], v[22:23], 1 +; GISEL-NEXT: v_or_b32_e32 v30, v30, v29 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v26 +; GISEL-NEXT: v_or_b32_e32 v24, v16, v24 +; GISEL-NEXT: v_or_b32_e32 v25, v17, v25 ; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v38, v19 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v39, v51, vcc +; GISEL-NEXT: v_or_b32_e32 v22, v22, v28 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v38, v18 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v39, v19, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v34, v36 ; GISEL-NEXT: v_or_b32_e32 v17, v35, v37 -; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v48, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v48, v30, vcc ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v49, v31, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v26 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v26, 1, v16 -; GISEL-NEXT: v_and_b32_e32 v17, v16, v12 +; GISEL-NEXT: v_and_b32_e32 v28, v16, v12 ; GISEL-NEXT: v_and_b32_e32 v29, v16, v13 -; GISEL-NEXT: v_and_b32_e32 v30, v16, v14 -; GISEL-NEXT: v_and_b32_e32 v50, v16, v15 -; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 -; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v51, v29, vcc +; GISEL-NEXT: v_and_b32_e32 v26, 1, v16 +; GISEL-NEXT: v_and_b32_e32 v50, v16, v14 +; GISEL-NEXT: v_and_b32_e32 v51, v16, v15 ; GISEL-NEXT: v_mov_b32_e32 v16, v26 ; GISEL-NEXT: v_mov_b32_e32 v17, v27 -; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc -; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v18, v28 +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v19, v29, vcc +; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v30, v50, vcc +; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v51, vcc ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB3_9 ; GISEL-NEXT: ; %bb.10: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll 
b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 1c687734731b1..4c5d495beb1ac 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -98,10 +98,10 @@ exit: define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { ; GCN-LABEL: extract_2xi64: ; GCN: ; %bb.0: +; GCN-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB1_2 @@ -110,11 +110,11 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[2:3], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[2:3], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[2:3], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -127,20 +127,20 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 
0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[0:1], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[0:1], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[0:1], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB1_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] +; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[5:6] ; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7] +; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[7:8] ; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: v_mov_b32_e32 v3, -1 @@ -166,10 +166,10 @@ exit: define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { ; GCN-LABEL: extract_4xi64: ; GCN: ; %bb.0: +; GCN-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB2_2 @@ -178,11 +178,11 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], 
s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[2:3], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[2:3], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[2:3], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -195,24 +195,24 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[0:1], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[0:1], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[0:1], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB2_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[5:6] ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[7:8] ; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[9:10] ; GCN-NEXT: 
v_cndmask_b32_e64 v4, v1, -1, vcc -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[11:12] ; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: v_mov_b32_e32 v3, -1 @@ -240,10 +240,10 @@ exit: define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { ; GCN-LABEL: extract_8xi64: ; GCN: ; %bb.0: +; GCN-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB3_2 @@ -260,13 +260,13 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[2:3], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[2:3], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[2:3], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[16:19], 
v[2:3], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: buffer_load_dwordx4 v[17:20], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB3_2: ; %Flow @@ -285,33 +285,33 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[0:1], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[0:1], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[0:1], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: buffer_load_dwordx4 v[17:20], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB3_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9] -; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] -; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13] -; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15] -; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17] -; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19] -; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, s[16:17] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[5:6] +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[7:8] ; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc -; GCN-NEXT: v_cndmask_b32_e64 
v4, v1, -1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v8, v1, -1, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v10, v1, -1, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v12, v1, -1, s[12:13] -; GCN-NEXT: v_cndmask_b32_e64 v14, v1, -1, s[14:15] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[9:10] +; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[11:12] +; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[15:16] +; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[17:18] +; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[13:14] +; GCN-NEXT: v_cndmask_b32_e64 v8, v1, -1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v10, v1, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v12, v1, -1, s[4:5] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[19:20] +; GCN-NEXT: v_cndmask_b32_e64 v14, v1, -1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: v_mov_b32_e32 v3, -1 ; GCN-NEXT: v_mov_b32_e32 v5, -1 @@ -342,10 +342,10 @@ exit: define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { ; GCN-LABEL: extract_2xf64: ; GCN: ; %bb.0: +; GCN-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB4_2 @@ -354,11 +354,11 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[2:3], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[2:3], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[2:3], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -371,20 +371,20 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[0:1], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[0:1], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[0:1], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB4_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 -; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5] +; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[5:6] ; GCN-NEXT: v_cndmask_b32_e64 v1, v0, -2.0, vcc -; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[6:7] +; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[7:8] ; GCN-NEXT: v_cndmask_b32_e64 v3, v0, -2.0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 @@ -410,10 +410,10 @@ exit: define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { ; GCN-LABEL: extract_4xf64: ; GCN: ; 
%bb.0: +; GCN-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB5_2 @@ -422,11 +422,11 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[2:3], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[2:3], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[2:3], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -439,24 +439,24 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[0:1], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[0:1], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[0:1], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 -; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5] +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[5:6] ; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc -; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7] +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[7:8] ; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc -; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9] +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[9:10] ; GCN-NEXT: v_cndmask_b32_e32 v5, -2.0, v0, vcc -; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11] +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[11:12] ; GCN-NEXT: v_cndmask_b32_e32 v7, -2.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 @@ -484,10 +484,10 @@ exit: define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { ; GCN-LABEL: extract_8xf64: ; GCN: ; %bb.0: +; GCN-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB6_2 @@ -504,13 +504,13 @@ define <8 x 
double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[2:3], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[2:3], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[2:3], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: buffer_load_dwordx4 v[17:20], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB6_2: ; %Flow @@ -529,33 +529,33 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[0:1], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[0:1], s[8:11], 0 addr64 offset:16 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[0:1], s[8:11], 0 addr64 offset:32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: buffer_load_dwordx4 v[17:20], v[0:1], s[8:11], 0 addr64 
offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB6_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 -; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v1, -2.0, v0, s[16:17] +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[5:6] +; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[7:8] ; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, -2.0, v0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v7, -2.0, v0, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v9, -2.0, v0, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v11, -2.0, v0, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v13, -2.0, v0, s[12:13] -; GCN-NEXT: v_cndmask_b32_e64 v15, -2.0, v0, s[14:15] +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[9:10] +; GCN-NEXT: v_cndmask_b32_e32 v5, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[11:12] +; GCN-NEXT: v_cndmask_b32_e32 v7, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[15:16] +; GCN-NEXT: v_cmp_nlt_f64_e64 s[4:5], -1.0, v[17:18] +; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[13:14] +; GCN-NEXT: v_cndmask_b32_e64 v9, -2.0, v0, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v11, -2.0, v0, vcc +; GCN-NEXT: v_cndmask_b32_e64 v13, -2.0, v0, s[4:5] +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[19:20] +; GCN-NEXT: v_cndmask_b32_e32 v15, -2.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index 
c69b0cce3d208..bb220fb59a182 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -454,6 +454,7 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v31, s67 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 m0, s2, 1 +; GCN-NEXT: v_mov_b32_e32 v34, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 @@ -484,10 +485,9 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v29, s65 ; GCN-NEXT: v_mov_b32_e32 v30, s66 ; GCN-NEXT: v_movrels_b32_e32 v32, v1 +; GCN-NEXT: v_mov_b32_e32 v33, s0 ; GCN-NEXT: v_movrels_b32_e32 v31, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32] +; GCN-NEXT: flat_store_dwordx2 v[33:34], v[31:32] ; GCN-NEXT: s_endpgm entry: %ext = extractelement <15 x double> , i32 %sel @@ -537,6 +537,7 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v31, s67 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 m0, s2, 1 +; GCN-NEXT: v_mov_b32_e32 v34, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 @@ -567,10 +568,9 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v29, s65 ; GCN-NEXT: v_mov_b32_e32 v30, s66 ; GCN-NEXT: v_movrels_b32_e32 v32, v1 +; GCN-NEXT: v_mov_b32_e32 v33, s0 ; GCN-NEXT: v_movrels_b32_e32 v31, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32] +; GCN-NEXT: flat_store_dwordx2 v[33:34], v[31:32] ; GCN-NEXT: s_endpgm entry: %ext = extractelement <16 x double> , i32 %sel @@ -581,13 +581,15 @@ entry: define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: 
float32_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v33, s1 ; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v32, s0 ; GCN-NEXT: v_mov_b32_e32 v3, 4.0 ; GCN-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; GCN-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -617,10 +619,8 @@ define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v29, 0x41f00000 ; GCN-NEXT: v_mov_b32_e32 v30, 0x41f80000 ; GCN-NEXT: v_mov_b32_e32 v31, 0x42000000 -; GCN-NEXT: v_movrels_b32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-NEXT: flat_store_dword v[32:33], v0 ; GCN-NEXT: s_endpgm entry: %ext = extractelement <32 x float> , i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index dcfac6fdbfc77..bdcaba33ca00d 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -738,79 +738,80 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 
v14, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_cmp_eq_u32 s8, 1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 -; SI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 5 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v13, 
vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 7 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 8 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 9 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 10 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 11 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 12 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 13 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v17, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 14 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 15 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, v[8:9], s[0:3], 0 addr64 +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: diff 
--git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 5fb50d0d89530..fe4687712a06e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3327,50 +3327,50 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v32f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v19 -; VI-NEXT: v_max_f16_sdwa v19, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v21, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v19 -; VI-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v0, v20 +; VI-NEXT: v_max_f16_sdwa v20, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v1, v21 ; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v19 -; VI-NEXT: v_max_f16_sdwa v19, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v19 -; VI-NEXT: v_max_f16_sdwa v19, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v20 +; VI-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v3, v21 ; VI-NEXT: 
v_max_f16_e32 v4, v4, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v19 -; VI-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v21, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v5, v5, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v19 -; VI-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v4, v4, v20 +; VI-NEXT: v_max_f16_sdwa v20, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v5, v5, v21 ; VI-NEXT: v_max_f16_e32 v6, v6, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v19 -; VI-NEXT: v_max_f16_sdwa v19, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v21, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v7, v7, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v19 -; VI-NEXT: v_max_f16_sdwa v19, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v6, v6, v20 +; VI-NEXT: v_max_f16_sdwa v20, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v7, v7, v21 ; VI-NEXT: v_max_f16_e32 v8, v8, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v19 -; VI-NEXT: v_max_f16_sdwa v19, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v21, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v9, v9, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v19 -; VI-NEXT: v_max_f16_sdwa v19, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v10, v10, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v19 -; VI-NEXT: v_max_f16_sdwa v19, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v11, v11, v11 ; 
VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v11, v11, v19 ; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v8, v8, v20 +; VI-NEXT: v_max_f16_sdwa v20, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v9, v9, v21 +; VI-NEXT: v_max_f16_e32 v10, v10, v10 +; VI-NEXT: v_max_f16_sdwa v21, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v15, v15, v15 ; VI-NEXT: v_max_f16_e32 v14, v14, v14 ; VI-NEXT: v_max_f16_e32 v13, v13, v13 ; VI-NEXT: v_max_f16_e32 v12, v12, v12 +; VI-NEXT: v_max_f16_e32 v11, v11, v11 +; VI-NEXT: v_or_b32_e32 v10, v10, v20 +; VI-NEXT: v_or_b32_e32 v11, v11, v21 ; VI-NEXT: v_or_b32_e32 v12, v12, v19 ; VI-NEXT: v_or_b32_e32 v13, v13, v18 ; VI-NEXT: v_or_b32_e32 v14, v14, v17 @@ -3497,104 +3497,104 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v64f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v33, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v2, v2, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; VI-NEXT: v_max_f16_sdwa v31, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v31 -; VI-NEXT: v_max_f16_sdwa v31, v1, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v31 -; VI-NEXT: v_max_f16_sdwa v31, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v31 +; VI-NEXT: v_or_b32_e32 v0, v0, v31 ; VI-NEXT: v_max_f16_sdwa v31, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v1, v32 +; VI-NEXT: v_max_f16_sdwa v32, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v31 -; VI-NEXT: v_max_f16_sdwa v31, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v4, v4, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v31 +; VI-NEXT: v_or_b32_e32 v3, v3, v31 ; VI-NEXT: v_max_f16_sdwa v31, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v4, v4, v32 ; VI-NEXT: v_max_f16_e32 v5, v5, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v31 -; VI-NEXT: v_max_f16_sdwa v31, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v6, v6, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v31 +; VI-NEXT: v_or_b32_e32 v5, v5, v31 ; VI-NEXT: v_max_f16_sdwa v31, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v6, v6, v32 ; VI-NEXT: v_max_f16_e32 v7, v7, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v31 -; VI-NEXT: v_max_f16_sdwa v31, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v8, v8, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v31 +; VI-NEXT: v_or_b32_e32 v7, v7, v31 ; VI-NEXT: 
v_max_f16_sdwa v31, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v8, v8, v32 ; VI-NEXT: v_max_f16_e32 v9, v9, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v31 -; VI-NEXT: v_max_f16_sdwa v31, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v10, v10, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v31 +; VI-NEXT: v_or_b32_e32 v9, v9, v31 ; VI-NEXT: v_max_f16_sdwa v31, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v10, v10, v32 ; VI-NEXT: v_max_f16_e32 v11, v11, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v31 -; VI-NEXT: v_max_f16_sdwa v31, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v12, v12, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v31 +; VI-NEXT: v_or_b32_e32 v11, v11, v31 ; VI-NEXT: v_max_f16_sdwa v31, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v12, v12, v32 ; VI-NEXT: v_max_f16_e32 v13, v13, v13 -; VI-NEXT: v_or_b32_e32 v13, v13, v31 -; VI-NEXT: v_max_f16_sdwa v31, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v14, v14, v14 -; VI-NEXT: v_or_b32_e32 v14, v14, v31 +; VI-NEXT: v_or_b32_e32 v13, v13, v31 ; VI-NEXT: v_max_f16_sdwa v31, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v14, v14, v32 ; VI-NEXT: v_max_f16_e32 v15, v15, v15 -; VI-NEXT: v_or_b32_e32 v15, v15, v31 -; VI-NEXT: v_max_f16_sdwa v31, v16, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v16, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v16, v16, v16 -; VI-NEXT: v_or_b32_e32 v16, v16, v31 +; VI-NEXT: v_or_b32_e32 v15, v15, v31 ; VI-NEXT: v_max_f16_sdwa v31, v17, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 ; VI-NEXT: v_max_f16_e32 v17, v17, v17 -; VI-NEXT: v_or_b32_e32 v17, v17, v31 -; VI-NEXT: v_max_f16_sdwa v31, v18, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v18, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v18, v18, v18 -; VI-NEXT: v_or_b32_e32 v18, v18, v31 +; VI-NEXT: v_or_b32_e32 v17, v17, v31 ; VI-NEXT: v_max_f16_sdwa v31, v19, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v18, v18, v32 ; VI-NEXT: v_max_f16_e32 v19, v19, v19 -; VI-NEXT: v_or_b32_e32 v19, v19, v31 -; VI-NEXT: v_max_f16_sdwa v31, v20, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v20, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v20, v20, v20 -; VI-NEXT: v_or_b32_e32 v20, v20, v31 +; VI-NEXT: v_or_b32_e32 v19, v19, v31 ; VI-NEXT: v_max_f16_sdwa v31, v21, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v20, v20, v32 ; VI-NEXT: v_max_f16_e32 v21, v21, v21 -; VI-NEXT: v_or_b32_e32 v21, v21, v31 -; VI-NEXT: v_max_f16_sdwa v31, v22, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v22, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v22, v22, v22 -; VI-NEXT: v_or_b32_e32 v22, v22, v31 +; VI-NEXT: v_or_b32_e32 v21, v21, v31 ; VI-NEXT: v_max_f16_sdwa v31, v23, v23 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v22, v22, v32 ; VI-NEXT: v_max_f16_e32 v23, v23, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v31 -; VI-NEXT: v_max_f16_sdwa v31, v24, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v24, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v24, v24, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v31 +; VI-NEXT: v_or_b32_e32 v23, v23, v31 ; VI-NEXT: v_max_f16_sdwa v31, v25, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v24, v24, v32 ; VI-NEXT: v_max_f16_e32 v25, v25, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v31 -; VI-NEXT: v_max_f16_sdwa v31, v26, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v26, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v26, v26, v26 -; VI-NEXT: v_or_b32_e32 v26, v26, v31 +; VI-NEXT: v_or_b32_e32 v25, v25, v31 ; VI-NEXT: v_max_f16_sdwa v31, v27, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v26, v26, v32 ; VI-NEXT: v_max_f16_e32 v27, v27, v27 -; VI-NEXT: v_or_b32_e32 v27, v27, v31 -; VI-NEXT: v_max_f16_sdwa v31, v28, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v32, v28, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v28, v28, v28 -; VI-NEXT: v_or_b32_e32 v28, v28, v31 +; VI-NEXT: v_or_b32_e32 v27, v27, v31 ; VI-NEXT: v_max_f16_sdwa v31, v29, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v28, v28, v32 ; VI-NEXT: v_max_f16_e32 v29, v29, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v31 -; VI-NEXT: v_max_f16_sdwa v31, v30, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa 
v32, v30, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v30, v30, v30 -; VI-NEXT: v_or_b32_e32 v30, v30, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: v_or_b32_e32 v29, v29, v31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_sdwa v32, v31, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v31, v31, v31 -; VI-NEXT: v_or_b32_e32 v31, v31, v32 +; VI-NEXT: v_max_f16_sdwa v31, v33, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v30, v30, v32 +; VI-NEXT: v_max_f16_e32 v32, v33, v33 +; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_var_v64f16: @@ -3668,52 +3668,41 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v21 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v28 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: 
v_or_b32_e32 v4, v5, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v10 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v10 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_or_b32_e32 v5, v6, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -3728,80 +3717,113 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v26 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:4 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v14 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: 
v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; CI-NEXT: v_or_b32_e32 v7, v8, v7 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v18 -; CI-NEXT: v_or_b32_e32 v8, v10, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v21 +; CI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v8, v11, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v18 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v30 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_or_b32_e32 v9, v10, v9 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; CI-NEXT: v_or_b32_e32 v10, v14, v10 -; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_or_b32_e32 v17, v18, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; CI-NEXT: v_or_b32_e32 v13, v16, v13 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 -; CI-NEXT: 
v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_or_b32_e32 v19, v20, v19 -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 -; CI-NEXT: v_or_b32_e32 v20, v22, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_or_b32_e32 v11, v12, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_or_b32_e32 v12, v15, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v22 ; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_or_b32_e32 v21, v22, v21 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v15, v16, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: 
v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v16, v18, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v26 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; CI-NEXT: v_or_b32_e32 v31, v32, v31 ; CI-NEXT: v_add_i32_e32 v32, vcc, 0x78, v0 ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_or_b32_e32 v18, v20, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_or_b32_e32 v13, v14, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v22, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; CI-NEXT: v_or_b32_e32 v10, v10, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; CI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; CI-NEXT: v_or_b32_e32 v9, v14, v9 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3966,49 +3988,28 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: 
v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; CI-NEXT: v_or_b32_e32 v12, v12, v15 -; CI-NEXT: v_or_b32_e32 v11, v16, v11 -; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 -; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 -; CI-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 -; CI-NEXT: buffer_store_dword v21, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 -; CI-NEXT: buffer_store_dword v20, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; CI-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; CI-NEXT: buffer_store_dword v17, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; CI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; CI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 
-; CI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v31, v21, s[0:3], 0 offen +; CI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; CI-NEXT: buffer_store_dword v9, v19, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v10, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; CI-NEXT: v_or_b32_e32 v21, v21, v22 +; CI-NEXT: buffer_store_dword v21, v9, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v20, v9, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; CI-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; CI-NEXT: buffer_store_dword v16, v9, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; CI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; CI-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; CI-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 ; CI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll index bd1f98a39c252..18429dd932d64 100644 --- a/llvm/test/CodeGen/AMDGPU/fceil64.ll +++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll @@ -403,349 +403,348 @@ define amdgpu_kernel void @fceil_v16f64(ptr addrspace(1) %out, <16 x double> %x) ; SI-LABEL: fceil_v16f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x29 -; SI-NEXT: s_mov_b32 s26, -1 -; SI-NEXT: s_mov_b32 s29, 0xfffff -; SI-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x9 -; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: s_mov_b32 s28, s26 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0xfffff +; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, 
s11, 0xb0014 -; SI-NEXT: s_and_b32 s2, s11, 0x80000000 -; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s3 +; SI-NEXT: s_and_b32 s3, s11, 0x80000000 +; SI-NEXT: s_add_i32 s24, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s24 ; SI-NEXT: s_andn2_b64 s[0:1], s[10:11], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cmp_lt_i32 s24, 0 ; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s2, s1 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s3, s11, s1 -; SI-NEXT: s_cselect_b32 s2, s10, s0 +; SI-NEXT: s_cselect_b32 s1, s3, s1 +; SI-NEXT: s_cmp_gt_i32 s24, 51 +; SI-NEXT: s_cselect_b32 s39, s11, s1 +; SI-NEXT: s_cselect_b32 s38, s10, s0 ; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[10:11], 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s38 +; SI-NEXT: v_mov_b32_e32 v1, s39 ; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[10:11], v[0:1] ; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s10, 0x3ff00000, 0 +; SI-NEXT: s_cselect_b32 s43, 0x3ff00000, 0 ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_and_b32 s6, s9, 0x80000000 -; SI-NEXT: s_add_i32 s7, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s7 +; SI-NEXT: s_and_b32 s3, s9, 0x80000000 +; SI-NEXT: s_add_i32 s10, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s6, s1 -; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: s_cselect_b32 s7, s9, s1 -; SI-NEXT: s_cselect_b32 s6, s8, s0 +; SI-NEXT: s_cselect_b32 s1, s3, s1 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s11, s9, s1 +; SI-NEXT: s_cselect_b32 s10, s8, s0 ; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[8:9], 0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: 
v_mov_b32_e32 v1, s11 ; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[8:9], v[0:1] ; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s27, 0x3ff00000, 0 +; SI-NEXT: s_cselect_b32 s44, 0x3ff00000, 0 ; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 -; SI-NEXT: s_and_b32 s8, s15, 0x80000000 -; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s9 +; SI-NEXT: s_and_b32 s3, s15, 0x80000000 +; SI-NEXT: s_add_i32 s8, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 ; SI-NEXT: s_andn2_b64 s[0:1], s[14:15], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s8, s1 -; SI-NEXT: s_cmp_gt_i32 s9, 51 -; SI-NEXT: s_cselect_b32 s9, s15, s1 -; SI-NEXT: s_cselect_b32 s8, s14, s0 -; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[14:15], 0 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: s_cselect_b32 s1, s3, s1 +; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s1, s15, s1 +; SI-NEXT: s_cselect_b32 s0, s14, s0 +; SI-NEXT: v_cmp_gt_f64_e64 s[8:9], s[14:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[14:15], v[0:1] -; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s14, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 -; SI-NEXT: s_and_b32 s10, s13, 0x80000000 -; SI-NEXT: s_add_i32 s15, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s15 -; SI-NEXT: s_andn2_b64 s[0:1], s[12:13], s[0:1] +; SI-NEXT: s_and_b64 s[8:9], s[8:9], vcc +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s8, s13, 0xb0014 +; SI-NEXT: s_and_b32 s14, s13, 0x80000000 +; SI-NEXT: s_add_i32 s15, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s15 +; SI-NEXT: s_andn2_b64 s[8:9], s[12:13], s[8:9] ; SI-NEXT: s_cmp_lt_i32 
s15, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s10, s1 -; SI-NEXT: v_cmp_gt_f64_e64 s[10:11], s[12:13], 0 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cselect_b32 s9, s14, s9 ; SI-NEXT: s_cmp_gt_i32 s15, 51 -; SI-NEXT: s_cselect_b32 s1, s13, s1 -; SI-NEXT: s_cselect_b32 s0, s12, s0 -; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_cselect_b32 s25, s13, s9 +; SI-NEXT: s_cselect_b32 s24, s12, s8 +; SI-NEXT: v_cmp_gt_f64_e64 s[8:9], s[12:13], 0 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 ; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[12:13], v[0:1] -; SI-NEXT: s_and_b64 s[2:3], s[10:11], vcc -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s10, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014 -; SI-NEXT: s_and_b32 s11, s19, 0x80000000 -; SI-NEXT: s_add_i32 s12, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s12 -; SI-NEXT: s_andn2_b64 s[2:3], s[18:19], s[2:3] +; SI-NEXT: s_and_b64 s[8:9], s[8:9], vcc +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s33, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 +; SI-NEXT: s_add_i32 s12, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s12 +; SI-NEXT: s_andn2_b64 s[8:9], s[18:19], s[8:9] +; SI-NEXT: s_and_b32 s13, s19, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s12, 0 -; SI-NEXT: s_cselect_b32 s13, 0, s2 -; SI-NEXT: s_cselect_b32 s11, s11, s3 -; SI-NEXT: v_cmp_gt_f64_e64 s[2:3], s[18:19], 0 +; SI-NEXT: s_cselect_b32 s9, s13, s9 +; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cmp_gt_i32 s12, 51 -; SI-NEXT: s_cselect_b32 s31, s19, s11 -; SI-NEXT: s_cselect_b32 s30, s18, s13 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s14 -; SI-NEXT: v_mov_b32_e32 v4, s30 -; SI-NEXT: v_mov_b32_e32 v5, s31 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[18:19], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[2:3], vcc -; SI-NEXT: 
s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s33, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014 -; SI-NEXT: s_and_b32 s6, s17, 0x80000000 -; SI-NEXT: s_add_i32 s7, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s7 -; SI-NEXT: s_andn2_b64 s[2:3], s[16:17], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s7, 0 -; SI-NEXT: s_cselect_b32 s11, 0, s2 -; SI-NEXT: s_cselect_b32 s6, s6, s3 -; SI-NEXT: v_cmp_gt_f64_e64 s[2:3], s[16:17], 0 -; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: s_cselect_b32 s19, s17, s6 -; SI-NEXT: s_cselect_b32 s18, s16, s11 -; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s19 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[16:17], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[2:3], vcc -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s36, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s2, s23, 0xb0014 -; SI-NEXT: s_and_b32 s6, s23, 0x80000000 -; SI-NEXT: s_add_i32 s7, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s7 -; SI-NEXT: s_andn2_b64 s[2:3], s[22:23], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s7, 0 -; SI-NEXT: s_cselect_b32 s6, s6, s3 -; SI-NEXT: s_cselect_b32 s8, 0, s2 -; SI-NEXT: v_cmp_gt_f64_e64 s[2:3], s[22:23], 0 -; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: s_cselect_b32 s35, s23, s6 +; SI-NEXT: s_cselect_b32 s27, s19, s9 +; SI-NEXT: s_cselect_b32 s26, s18, s8 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: v_cmp_gt_f64_e64 s[8:9], s[18:19], 0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[18:19], v[0:1] +; SI-NEXT: s_and_b64 s[8:9], s[8:9], vcc +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s40, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014 +; SI-NEXT: s_add_i32 s12, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s12 +; SI-NEXT: s_andn2_b64 s[8:9], s[16:17], s[8:9] +; SI-NEXT: s_and_b32 s13, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s12, 0 +; SI-NEXT: s_cselect_b32 s9, s13, s9 +; SI-NEXT: 
s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cmp_gt_i32 s12, 51 +; SI-NEXT: s_cselect_b32 s29, s17, s9 +; SI-NEXT: s_cselect_b32 s28, s16, s8 +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_cmp_gt_f64_e64 s[8:9], s[16:17], 0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[16:17], v[0:1] +; SI-NEXT: s_and_b64 s[8:9], s[8:9], vcc +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s41, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s8, s23, 0xb0014 +; SI-NEXT: s_add_i32 s12, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s12 +; SI-NEXT: s_andn2_b64 s[8:9], s[22:23], s[8:9] +; SI-NEXT: s_and_b32 s13, s23, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s12, 0 +; SI-NEXT: s_cselect_b32 s9, s13, s9 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cmp_gt_i32 s12, 51 +; SI-NEXT: s_cselect_b32 s35, s23, s9 ; SI-NEXT: s_cselect_b32 s34, s22, s8 -; SI-NEXT: v_add_f64 v[4:5], s[0:1], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s34 -; SI-NEXT: v_mov_b32_e32 v10, s35 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[22:23], v[9:10] -; SI-NEXT: s_and_b64 s[0:1], s[2:3], vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_mov_b32_e32 v1, s35 +; SI-NEXT: v_cmp_gt_f64_e64 s[8:9], s[22:23], 0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[22:23], v[0:1] +; SI-NEXT: s_and_b64 s[8:9], s[8:9], vcc +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s42, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s8, s21, 0xb0014 +; SI-NEXT: s_add_i32 s12, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s12 +; SI-NEXT: s_andn2_b64 s[8:9], s[20:21], s[8:9] +; SI-NEXT: s_and_b32 s13, s21, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s12, 0 +; SI-NEXT: s_cselect_b32 s9, s13, s9 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cmp_gt_i32 s12, 51 +; SI-NEXT: s_cselect_b32 s31, s21, s9 +; SI-NEXT: s_cselect_b32 s30, s20, s8 +; SI-NEXT: v_mov_b32_e32 v0, s30 +; SI-NEXT: v_mov_b32_e32 v1, s31 +; SI-NEXT: v_cmp_gt_f64_e64 s[36:37], s[20:21], 0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[20:21], v[0:1] +; SI-NEXT: v_mov_b32_e32 
v4, 0 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_add_f64 v[2:3], s[38:39], v[4:5] +; SI-NEXT: v_mov_b32_e32 v5, s44 +; SI-NEXT: v_add_f64 v[0:1], s[10:11], v[4:5] +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x39 +; SI-NEXT: s_and_b64 s[36:37], s[36:37], vcc +; SI-NEXT: s_and_b64 s[36:37], s[36:37], exec +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_cselect_b32 s38, 0x3ff00000, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[36:37], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[36:37], s[10:11], s[36:37] +; SI-NEXT: s_and_b32 s39, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s37, s39, s37 +; SI-NEXT: s_cselect_b32 s36, 0, s36 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s37, s11, s37 +; SI-NEXT: s_cselect_b32 s36, s10, s36 +; SI-NEXT: v_add_f64 v[8:9], s[0:1], v[4:5] +; SI-NEXT: v_mov_b32_e32 v5, s36 +; SI-NEXT: v_mov_b32_e32 v6, s37 +; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[10:11], 0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[10:11], v[5:6] +; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s37, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s0, s21, 0xb0014 -; SI-NEXT: s_and_b32 s2, s21, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v5, s33 +; SI-NEXT: s_cselect_b32 s33, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 ; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s3 -; SI-NEXT: s_andn2_b64 s[0:1], s[20:21], s[0:1] +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s1, s2, s1 +; SI-NEXT: s_cselect_b32 s1, s10, s1 ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s17, s21, s1 -; SI-NEXT: s_cselect_b32 s16, s20, s0 -; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[20:21], 0 -; SI-NEXT: v_mov_b32_e32 v9, s16 -; 
SI-NEXT: v_mov_b32_e32 v10, s17 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[20:21], v[9:10] -; SI-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x39 -; SI-NEXT: s_mov_b32 s27, 0xf000 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16 -; SI-NEXT: v_mov_b32_e32 v9, s33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[6:7], s[30:31], v[8:9] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_gt_f64_e64 s[20:21], s[2:3], 0 -; SI-NEXT: v_mov_b32_e32 v9, s36 -; SI-NEXT: v_cmp_gt_f64_e64 s[30:31], s[0:1], 0 -; SI-NEXT: v_add_f64 v[4:5], s[18:19], v[8:9] -; SI-NEXT: v_cmp_gt_f64_e64 s[18:19], s[6:7], 0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:32 -; SI-NEXT: v_mov_b32_e32 v9, s37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[6:7], s[34:35], v[8:9] -; SI-NEXT: v_cmp_gt_f64_e64 s[34:35], s[4:5], 0 -; SI-NEXT: s_and_b64 s[22:23], s[22:23], vcc -; SI-NEXT: s_and_b64 s[22:23], s[22:23], exec -; SI-NEXT: s_cselect_b32 s22, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s23, s3, 0xb0014 -; SI-NEXT: s_and_b32 s33, s3, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: s_add_i32 s36, s23, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[22:23], s[28:29], s36 -; SI-NEXT: s_andn2_b64 s[22:23], s[2:3], s[22:23] -; SI-NEXT: s_cmp_lt_i32 s36, 0 -; SI-NEXT: s_cselect_b32 s38, 0, s22 -; SI-NEXT: s_cselect_b32 s33, s33, s23 -; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[10:11], 0 -; SI-NEXT: s_cmp_gt_i32 s36, 51 -; SI-NEXT: s_cselect_b32 s37, s3, s33 -; SI-NEXT: s_cselect_b32 s36, s2, s38 -; SI-NEXT: v_add_f64 v[4:5], s[16:17], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s36 -; SI-NEXT: v_mov_b32_e32 v10, s37 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[2:3], v[9:10] -; SI-NEXT: s_and_b64 s[2:3], s[20:21], vcc -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s3, s1, 0xb0014 -; SI-NEXT: s_and_b32 s16, s1, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: s_add_i32 s17, s3, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], 
s17 -; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s17, 0 -; SI-NEXT: s_cselect_b32 s20, 0, s2 -; SI-NEXT: s_cselect_b32 s16, s16, s3 -; SI-NEXT: v_cmp_gt_f64_e64 s[2:3], s[8:9], 0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:48 -; SI-NEXT: s_cmp_gt_i32 s17, 51 -; SI-NEXT: s_cselect_b32 s17, s1, s16 -; SI-NEXT: s_cselect_b32 s16, s0, s20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[6:7], s[36:37], v[8:9] -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: v_mov_b32_e32 v5, s17 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[0:1], v[4:5] -; SI-NEXT: s_and_b64 s[0:1], s[30:31], vcc +; SI-NEXT: s_cselect_b32 s11, s9, s1 +; SI-NEXT: s_cselect_b32 s10, s8, s0 +; SI-NEXT: v_add_f64 v[6:7], s[24:25], v[4:5] +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[8:9], 0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[8:9], v[10:11] +; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s1, s7, 0xb0014 -; SI-NEXT: s_and_b32 s20, s7, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s0 -; SI-NEXT: s_add_i32 s21, s1, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s21 -; SI-NEXT: s_andn2_b64 s[0:1], s[6:7], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s21, 0 -; SI-NEXT: s_cselect_b32 s30, 0, s0 -; SI-NEXT: s_cselect_b32 s20, s20, s1 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: s_cselect_b32 s24, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 +; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[0:1], s[14:15], s[0:1] +; SI-NEXT: s_and_b32 s8, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s1, s8, s1 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s9, s15, s1 +; SI-NEXT: s_cselect_b32 s8, s14, s0 +; SI-NEXT: v_add_f64 v[12:13], s[26:27], v[4:5] +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: 
v_mov_b32_e32 v10, s8 ; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[14:15], 0 -; SI-NEXT: s_cmp_gt_i32 s21, 51 -; SI-NEXT: s_cselect_b32 s21, s7, s20 -; SI-NEXT: s_cselect_b32 s20, s6, s30 -; SI-NEXT: v_add_f64 v[4:5], s[16:17], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[6:7], v[9:10] -; SI-NEXT: s_and_b64 s[6:7], s[18:19], vcc -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s7, s5, 0xb0014 -; SI-NEXT: s_and_b32 s16, s5, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: s_add_i32 s17, s7, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[28:29], s17 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_cmp_lt_i32 s17, 0 -; SI-NEXT: s_cselect_b32 s18, 0, s6 -; SI-NEXT: s_cselect_b32 s16, s16, s7 -; SI-NEXT: v_cmp_gt_f64_e64 s[6:7], s[12:13], 0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:64 -; SI-NEXT: s_cmp_gt_i32 s17, 51 -; SI-NEXT: s_cselect_b32 s17, s5, s16 -; SI-NEXT: s_cselect_b32 s16, s4, s18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[6:7], s[20:21], v[8:9] -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: v_mov_b32_e32 v5, s17 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[34:35], vcc +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[14:15], v[10:11] +; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: s_cselect_b32 s25, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 +; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[0:1], s[12:13], s[0:1] +; SI-NEXT: s_and_b32 s14, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s1, s14, s1 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s15, s13, s1 +; SI-NEXT: s_cselect_b32 s14, s12, s0 +; SI-NEXT: v_add_f64 v[10:11], s[28:29], v[4:5] +; 
SI-NEXT: v_mov_b32_e32 v5, s42 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_add_f64 v[16:17], s[34:35], v[4:5] +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[12:13], v[14:15] +; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[12:13], 0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cselect_b32 s26, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s0, s19, 0xb0014 +; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[0:1], s[18:19], s[0:1] +; SI-NEXT: s_and_b32 s12, s19, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s1, s12, s1 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s13, s19, s1 +; SI-NEXT: s_cselect_b32 s12, s18, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[18:19], v[14:15] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_mov_b32_e32 v5, s38 +; SI-NEXT: v_cmp_gt_f64_e64 s[4:5], s[18:19], 0 +; SI-NEXT: v_add_f64 v[14:15], s[30:31], v[4:5] +; SI-NEXT: v_mov_b32_e32 v5, s33 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s5, s11, 0xb0014 -; SI-NEXT: s_and_b32 s18, s11, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_add_f64 v[4:5], s[16:17], v[8:9] -; SI-NEXT: s_add_i32 s16, s5, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], s16 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_cmp_lt_i32 s16, 0 +; SI-NEXT: s_cselect_b32 s18, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s4, s17, 0xb0014 +; SI-NEXT: s_and_b32 s19, s17, 0x80000000 +; SI-NEXT: s_add_i32 s27, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s27 +; SI-NEXT: s_andn2_b64 s[4:5], s[16:17], s[4:5] +; SI-NEXT: s_cmp_lt_i32 s27, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, 
s18, s5 -; SI-NEXT: s_cmp_gt_i32 s16, 51 -; SI-NEXT: s_cselect_b32 s5, s11, s5 -; SI-NEXT: s_cselect_b32 s4, s10, s4 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:80 +; SI-NEXT: s_cselect_b32 s5, s19, s5 +; SI-NEXT: s_cmp_gt_i32 s27, 51 +; SI-NEXT: s_cselect_b32 s5, s17, s5 +; SI-NEXT: s_cselect_b32 s4, s16, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[10:11], v[4:5] -; SI-NEXT: s_and_b64 s[10:11], s[22:23], vcc +; SI-NEXT: v_add_f64 v[8:9], s[36:37], v[4:5] +; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[16:17], v[5:6] +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_add_f64 v[6:7], s[10:11], v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 s[10:11], s[16:17], 0 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], vcc ; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec -; SI-NEXT: s_cselect_b32 s10, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s11, s9, 0xb0014 -; SI-NEXT: s_and_b32 s16, s9, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: s_add_i32 s17, s11, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[10:11], s[28:29], s17 -; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[10:11] -; SI-NEXT: s_cmp_lt_i32 s17, 0 +; SI-NEXT: s_cselect_b32 s16, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s10, s23, 0xb0014 +; SI-NEXT: s_and_b32 s17, s23, 0x80000000 +; SI-NEXT: s_add_i32 s19, s10, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], s19 +; SI-NEXT: s_andn2_b64 s[10:11], s[22:23], s[10:11] +; SI-NEXT: s_cmp_lt_i32 s19, 0 ; SI-NEXT: s_cselect_b32 s10, 0, s10 -; SI-NEXT: s_cselect_b32 s11, s16, s11 -; SI-NEXT: s_cmp_gt_i32 s17, 51 -; SI-NEXT: s_cselect_b32 s11, s9, s11 -; SI-NEXT: s_cselect_b32 s10, s8, s10 -; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[8:9] -; SI-NEXT: 
v_mov_b32_e32 v4, s10 -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[8:9], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[2:3], vcc -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 -; SI-NEXT: s_and_b32 s4, s15, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: s_add_i32 s5, s3, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s5 -; SI-NEXT: s_andn2_b64 s[2:3], s[14:15], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s5, 0 -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, s4, s3 -; SI-NEXT: s_cmp_gt_i32 s5, 51 -; SI-NEXT: s_cselect_b32 s3, s15, s3 -; SI-NEXT: s_cselect_b32 s2, s14, s2 -; SI-NEXT: v_add_f64 v[4:5], s[10:11], v[8:9] -; SI-NEXT: v_mov_b32_e32 v10, s3 -; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[14:15], v[9:10] -; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 -; SI-NEXT: s_and_b32 s5, s13, 0x80000000 -; SI-NEXT: s_add_i32 s8, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s8 -; SI-NEXT: s_andn2_b64 s[0:1], s[12:13], s[0:1] +; SI-NEXT: s_cselect_b32 s11, s17, s11 +; SI-NEXT: s_cmp_gt_i32 s19, 51 +; SI-NEXT: s_cselect_b32 s11, s23, s11 +; SI-NEXT: s_cselect_b32 s10, s22, s10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[12:13], s[8:9], v[4:5] +; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[22:23], v[10:11] +; SI-NEXT: v_cmp_gt_f64_e64 s[8:9], s[22:23], 0 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_add_f64 v[10:11], s[14:15], v[4:5] +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], vcc +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s14, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s8, s21, 0xb0014 +; SI-NEXT: s_and_b32 s9, s21, 
0x80000000 +; SI-NEXT: s_addk_i32 s8, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 +; SI-NEXT: s_andn2_b64 s[6:7], s[20:21], s[6:7] ; SI-NEXT: s_cmp_lt_i32 s8, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s5, s1 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s9, s7 ; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s1, s13, s1 -; SI-NEXT: s_cselect_b32 s0, s12, s0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:96 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_cselect_b32 s7, s21, s7 +; SI-NEXT: s_cselect_b32 s6, s20, s6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[12:13], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SI-NEXT: v_add_f64 v[16:17], s[12:13], v[4:5] +; SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[20:21], v[5:6] +; SI-NEXT: v_cmp_gt_f64_e64 s[8:9], s[20:21], 0 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_add_f64 v[14:15], s[4:5], v[4:5] +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: s_and_b64 s[4:5], s[8:9], vcc ; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_add_f64 v[6:7], s[2:3], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_add_f64 v[4:5], s[0:1], v[8:9] -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:112 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[24:27], 0 +; SI-NEXT: v_add_f64 v[6:7], s[10:11], v[4:5] +; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:96 +; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[4:5] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = 
call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 450d66767600b..7f98c2c6021c2 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1490,79 +1490,79 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 ; GCN-NEXT: v_mul_f32_e64 v16, 1.0, s1 ; GCN-NEXT: v_mul_f32_e64 v17, 1.0, s0 ; GCN-NEXT: v_mul_f32_e64 v18, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s14 +; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v20, 1.0, s14 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v20, 16, 15 ; GCN-NEXT: v_and_b32_e32 v15, 0x8000, v15 -; GCN-NEXT: v_or_b32_e32 v15, v19, v15 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s15 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s15 +; GCN-NEXT: v_or_b32_e32 v15, v20, v15 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v14, 0x8000, v14 -; GCN-NEXT: v_or_b32_e32 v14, v19, v14 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s12 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s12 +; GCN-NEXT: v_or_b32_e32 v14, v20, v14 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v13, 0x8000, v13 -; GCN-NEXT: v_or_b32_e32 v13, v19, v13 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s13 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s13 +; GCN-NEXT: v_or_b32_e32 v13, v20, v13 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v12, 0x8000, v12 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s10 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s10 +; GCN-NEXT: 
v_or_b32_e32 v12, v20, v12 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v11, 0x8000, v11 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s11 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s11 +; GCN-NEXT: v_or_b32_e32 v11, v20, v11 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v10, 0x8000, v10 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s8 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s8 +; GCN-NEXT: v_or_b32_e32 v10, v20, v10 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v9, 0x8000, v9 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s9 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s9 +; GCN-NEXT: v_or_b32_e32 v9, v20, v9 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v8, 0x8000, v8 -; GCN-NEXT: v_or_b32_e32 v8, v19, v8 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s6 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s6 +; GCN-NEXT: v_or_b32_e32 v8, v20, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GCN-NEXT: v_or_b32_e32 v7, v19, v7 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s7 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s7 +; GCN-NEXT: v_or_b32_e32 v7, v20, v7 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GCN-NEXT: v_or_b32_e32 v6, v19, v6 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s4 +; GCN-NEXT: v_or_b32_e32 v6, v20, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; 
GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_or_b32_e32 v5, v19, v5 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s5 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v21, 1.0, s5 +; GCN-NEXT: v_or_b32_e32 v5, v20, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 ; GCN-NEXT: v_bfe_u32 v18, v18, 16, 15 ; GCN-NEXT: v_bfe_u32 v17, v17, 16, 15 ; GCN-NEXT: v_bfe_u32 v16, v16, 16, 15 +; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: v_or_b32_e32 v4, v20, v4 ; GCN-NEXT: v_or_b32_e32 v3, v19, v3 ; GCN-NEXT: v_or_b32_e32 v2, v18, v2 ; GCN-NEXT: v_or_b32_e32 v1, v17, v1 @@ -1596,94 +1596,94 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 ; GFX7-LABEL: s_copysign_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s30 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s14 ; GFX7-NEXT: v_mul_f32_e64 v14, 1.0, s31 +; GFX7-NEXT: v_mul_f32_e64 v20, 1.0, s14 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0x8000, v15 -; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GFX7-NEXT: v_bfe_u32 v20, v20, 16, 15 +; GFX7-NEXT: v_mul_f32_e64 v21, 1.0, s15 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_or_b32_e32 v15, v19, v15 -; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s15 +; GFX7-NEXT: v_or_b32_e32 v15, v20, v15 ; GFX7-NEXT: 
v_and_b32_e32 v14, 0x8000, v14 -; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v14, v19, v14 +; GFX7-NEXT: v_bfe_u32 v20, v21, 16, 15 ; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s28 +; GFX7-NEXT: v_or_b32_e32 v14, v20, v14 +; GFX7-NEXT: v_mul_f32_e64 v12, 1.0, s29 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e64 v20, 1.0, s12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX7-NEXT: v_or_b32_e32 v14, v15, v14 -; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s12 -; GFX7-NEXT: v_mul_f32_e64 v12, 1.0, s29 ; GFX7-NEXT: v_and_b32_e32 v13, 0x8000, v13 -; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 +; GFX7-NEXT: v_bfe_u32 v15, v20, 16, 15 +; GFX7-NEXT: v_mul_f32_e64 v20, 1.0, s13 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_or_b32_e32 v13, v15, v13 -; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s13 ; GFX7-NEXT: v_and_b32_e32 v12, 0x8000, v12 -; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v12 +; GFX7-NEXT: v_bfe_u32 v15, v20, 16, 15 ; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s26 +; GFX7-NEXT: v_or_b32_e32 v12, v15, v12 +; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s27 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s10 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: v_or_b32_e32 v12, v13, v12 -; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s10 -; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s27 ; GFX7-NEXT: v_and_b32_e32 v11, 0x8000, v11 -; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 +; GFX7-NEXT: v_bfe_u32 v13, v15, 16, 15 +; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s11 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_or_b32_e32 v11, v13, v11 -; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s11 ; GFX7-NEXT: v_and_b32_e32 v10, 0x8000, v10 -; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v10, v13, v10 +; GFX7-NEXT: v_bfe_u32 v13, v15, 16, 15 ; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; GFX7-NEXT: v_or_b32_e32 v10, v13, v10 +; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s25 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 
+; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s8 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX7-NEXT: v_or_b32_e32 v10, v11, v10 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s8 -; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s25 ; GFX7-NEXT: v_and_b32_e32 v9, 0x8000, v9 -; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 +; GFX7-NEXT: v_bfe_u32 v11, v13, 16, 15 +; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s9 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_or_b32_e32 v9, v11, v9 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s9 ; GFX7-NEXT: v_and_b32_e32 v8, 0x8000, v8 -; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 +; GFX7-NEXT: v_bfe_u32 v11, v13, 16, 15 +; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s23 ; GFX7-NEXT: v_or_b32_e32 v8, v11, v8 ; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s22 +; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s23 +; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7 -; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GFX7-NEXT: v_bfe_u32 v9, v11, 16, 15 +; GFX7-NEXT: v_bfe_u32 v9, v15, 16, 15 ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s20 +; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 +; GFX7-NEXT: v_bfe_u32 v11, v13, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v6, v9, v6 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; GFX7-NEXT: v_mul_f32_e64 v20, 1.0, s4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s4 +; GFX7-NEXT: v_or_b32_e32 v7, v11, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e64 v21, 1.0, s5 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s5 ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; 
GFX7-NEXT: v_bfe_u32 v7, v13, 16, 15 +; GFX7-NEXT: v_bfe_u32 v7, v20, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GFX7-NEXT: v_bfe_u32 v7, v15, 16, 15 +; GFX7-NEXT: v_bfe_u32 v7, v21, 16, 15 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e64 v18, 1.0, s3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; GFX7-NEXT: v_bfe_u32 v5, v19, 16, 15 @@ -1693,11 +1693,11 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e64 v17, 1.0, s0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e64 v16, 1.0, s1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GFX7-NEXT: v_bfe_u32 v3, v17, 16, 15 @@ -2270,12 +2270,6 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GCN-LABEL: v_copysign_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 -; GCN-NEXT: v_and_b32_e32 v30, 0x8000, v30 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 @@ -2290,6 +2284,11 @@ 
define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GCN-NEXT: v_or_b32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 +; GCN-NEXT: v_and_b32_e32 v27, 0x8000, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; GCN-NEXT: v_or_b32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 @@ -2312,11 +2311,8 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GCN-NEXT: v_and_b32_e32 v27, 0x8000, v27 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 @@ -2340,6 +2336,8 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 ; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 ; GCN-NEXT: v_and_b32_e32 v26, 0x8000, v26 ; GCN-NEXT: v_and_b32_e32 v25, 0x8000, v25 @@ -2352,6 +2350,7 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GCN-NEXT: v_and_b32_e32 v18, 0x8000, v18 ; GCN-NEXT: v_and_b32_e32 v17, 0x8000, v17 ; GCN-NEXT: v_and_b32_e32 v16, 0x8000, v16 +; GCN-NEXT: v_and_b32_e32 v27, 0x8000, v27 ; GCN-NEXT: v_or_b32_e32 v10, v10, v26 ; GCN-NEXT: v_or_b32_e32 v9, v9, v25 ; GCN-NEXT: v_or_b32_e32 v8, v8, 
v24 @@ -2363,8 +2362,9 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GCN-NEXT: v_or_b32_e32 v2, v2, v18 ; GCN-NEXT: v_or_b32_e32 v1, v1, v17 ; GCN-NEXT: v_or_b32_e32 v0, v0, v16 +; GCN-NEXT: v_or_b32_e32 v14, v14, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v28 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -2389,21 +2389,21 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GFX7-LABEL: v_copysign_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v27, 0x8000, v27 -; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_and_b32_e32 v29, 0x8000, v29 +; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v13, v13, v29 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GFX7-NEXT: v_bfe_u32 v8, v8, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v24, 0x8000, v24 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 @@ -2413,18 +2413,18 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v19, 
1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23 @@ -2438,19 +2438,19 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v30, 0x8000, v30 ; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v29, 0x8000, v29 -; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v28, 0x8000, v28 ; GFX7-NEXT: v_bfe_u32 v12, v12, 16, 15 +; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 ; GFX7-NEXT: v_bfe_u32 v10, v10, 16, 15 ; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 ; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 +; GFX7-NEXT: v_and_b32_e32 v27, 0x8000, v27 ; GFX7-NEXT: v_and_b32_e32 v26, 0x8000, v26 ; GFX7-NEXT: v_and_b32_e32 v25, 0x8000, v25 ; GFX7-NEXT: v_and_b32_e32 v23, 0x8000, v23 @@ -2470,13 +2470,13 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GFX7-NEXT: 
v_and_b32_e32 v16, 0x8000, v16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v14, v14, v30 -; GFX7-NEXT: v_or_b32_e32 v13, v13, v29 ; GFX7-NEXT: v_or_b32_e32 v12, v12, v28 +; GFX7-NEXT: v_or_b32_e32 v11, v11, v27 ; GFX7-NEXT: v_or_b32_e32 v10, v10, v26 ; GFX7-NEXT: v_or_b32_e32 v9, v9, v25 ; GFX7-NEXT: v_or_b32_e32 v7, v7, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v29 ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0x8000, v24 ; GFX7-NEXT: v_or_b32_e32 v15, v15, v24 @@ -2572,11 +2572,11 @@ define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_bfe_u32 v32, v32, 16, 15 -; GCN-NEXT: v_and_b32_e32 v31, 0x8000, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GCN-NEXT: v_bfe_u32 v31, v32, 16, 15 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v33 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 +; GCN-NEXT: v_or_b32_e32 v31, v31, v32 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_bfe_u32 v30, v30, 16, 15 ; GCN-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index ba4fe3685458d..15bf6d4b7f3ac 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1579,116 +1579,116 @@ define amdgpu_ps <4 x i32> @s_copysign_v8f16(<8 x half> inreg %arg_mag, <8 x hal define amdgpu_ps <8 x i32> @s_copysign_v16f16(<16 x half> inreg %arg_mag, <16 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v16f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v16, s31 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s30 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 
v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s31 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s30 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s12 -; SI-NEXT: v_bfi_b32 v16, s0, v17, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 -; SI-NEXT: v_bfi_b32 v18, s0, v19, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_bfi_b32 v12, s0, v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s11 -; SI-NEXT: v_bfi_b32 v17, s0, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 +; SI-NEXT: v_bfi_b32 v17, s0, v18, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_bfi_b32 v18, s0, v19, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s12 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s27 +; SI-NEXT: v_bfi_b32 v18, s0, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_bfi_b32 v16, s0, v19, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; 
SI-NEXT: v_cvt_f16_f32_e32 v15, s26 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s10 -; SI-NEXT: v_or_b32_e32 v12, v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s24 -; SI-NEXT: v_bfi_b32 v13, s0, v13, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_bfi_b32 v15, s0, v18, v15 -; SI-NEXT: v_bfi_b32 v14, s0, v17, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_bfi_b32 v11, s0, v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 +; SI-NEXT: v_bfi_b32 v18, s0, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_bfi_b32 v15, s0, v19, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s9 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s8 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s1 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s3 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s5 +; SI-NEXT: 
v_cvt_f16_f32_e32 v21, s23 +; SI-NEXT: v_bfi_b32 v18, s0, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s2 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s22 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s6 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s6 +; SI-NEXT: v_bfi_b32 v14, s0, v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_bfi_b32 v15, s0, v15, v19 -; SI-NEXT: v_bfi_b32 v10, s0, v18, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_bfi_b32 v18, s0, v20, v19 +; SI-NEXT: v_bfi_b32 v10, s0, v11, v10 ; SI-NEXT: v_bfi_b32 v6, s0, v7, v6 ; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_bfi_b32 v14, s0, v14, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_bfi_b32 v12, s0, v13, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_bfi_b32 v8, s0, v9, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_bfi_b32 v4, s0, 
v5, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1696,10 +1696,10 @@ define amdgpu_ps <8 x i32> @s_copysign_v16f16(<16 x half> inreg %arg_mag, <16 x ; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: v_readfirstlane_b32 s2, v8 ; SI-NEXT: v_readfirstlane_b32 s3, v9 -; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v16f16: @@ -2040,78 +2040,78 @@ define <16 x half> @v_copysign_v16f16(<16 x half> %mag, <16 x half> %sign) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: 
v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_bfi_b32 v4, s4, v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_bfi_b32 v4, s4, v4, v17 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v18 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; SI-NEXT: v_bfi_b32 v6, s4, v6, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v16 +; SI-NEXT: v_bfi_b32 v5, s4, v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_bfi_b32 v6, s4, v6, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 ; SI-NEXT: 
v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_bfi_b32 v8, s4, v8, v17 +; SI-NEXT: v_bfi_b32 v8, s4, v8, v16 ; SI-NEXT: v_bfi_b32 v9, s4, v9, v18 ; SI-NEXT: v_bfi_b32 v10, s4, v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_bfi_b32 v7, s4, v7, v21 ; SI-NEXT: v_bfi_b32 v11, s4, v11, v20 -; SI-NEXT: v_bfi_b32 v12, s4, v12, v17 +; SI-NEXT: v_bfi_b32 v12, s4, v12, v16 ; SI-NEXT: v_bfi_b32 v13, s4, v13, v18 ; SI-NEXT: v_bfi_b32 v14, s4, v14, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_bfi_b32 v15, s4, v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_bfi_b32 v15, s4, v15, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index cdd34cbde6ddd..bb107b48eb191 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -170,52 +170,52 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] ; GISEL-NEXT: 
v_lshlrev_b16_e32 v2, 1, v0 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] ; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 ; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 ; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 ; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 ; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 ; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 ; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 ; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 ; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 ; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 ; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 ; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 ; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; 
GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v19 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -294,68 +294,68 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_or3_b32 v2, 
v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 -; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 -; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 -; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 -; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 -; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 
-; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 -; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 -; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v21, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v22, 21, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 23, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v21, v22 +; GISEL-NEXT: v_or3_b32 v2, v2, v21, v22 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 25, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 27, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 29, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 -; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v11, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB0_9: ; %Flow3 @@ -535,52 +535,52 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] ; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] ; GISEL-NEXT: 
v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 ; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 ; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 ; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 ; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 ; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 ; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 ; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 ; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 ; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 ; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 ; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 ; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: 
v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v19 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -659,68 +659,68 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, 
v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 -; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 -; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 -; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 -; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 -; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 -; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 -; 
GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 -; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 -; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v21, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v22, 21, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 23, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v21, v22 +; GISEL-NEXT: v_or3_b32 v2, v2, v21, v22 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 25, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 27, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 29, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 -; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v11, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB1_9: ; %Flow3 @@ -893,52 +893,52 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] ; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] ; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 ; GISEL-NEXT: v_lshlrev_b16_e32 
v8, 3, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 ; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 ; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 ; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 ; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 ; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 ; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 ; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 ; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 ; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 ; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 ; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v19 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -1010,68 +1010,68 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: 
v_lshlrev_b32_e32 v11, 10, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 -; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 -; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 -; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 -; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 -; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 -; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 -; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 -; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 +; 
GISEL-NEXT: v_lshlrev_b32_e32 v21, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v22, 21, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 23, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v21, v22 +; GISEL-NEXT: v_or3_b32 v2, v2, v21, v22 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 25, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 27, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 29, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 -; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v11, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB2_9: ; %Flow3 @@ -1244,52 +1244,52 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] ; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] ; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 ; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 ; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 
-; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 ; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 ; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 ; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 ; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 ; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 ; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 ; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 ; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 ; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 ; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; 
GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v19 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -1361,68 +1361,68 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: 
v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 -; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 -; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 -; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 -; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 -; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 -; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 -; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 -; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v21, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v22, 21, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; 
GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 23, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v21, v22 +; GISEL-NEXT: v_or3_b32 v2, v2, v21, v22 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 25, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 27, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 29, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 -; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v11, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB3_9: ; %Flow3 @@ -1746,68 +1746,68 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_or3_b32 v2, 
v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 -; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 -; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 -; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 -; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 -; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 
-; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 -; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 -; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v21, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v22, 21, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 23, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v21, v22 +; GISEL-NEXT: v_or3_b32 v2, v2, v21, v22 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 25, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 27, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 29, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 -; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v11, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB6_9: ; %Flow3 @@ -2093,68 +2093,68 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v2, v1, 
v2 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 ; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 ; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 ; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 ; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 ; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 -; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 -; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 -; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 -; GISEL-NEXT: 
v_lshlrev_b32_e32 v7, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 -; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 -; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 -; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 -; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 -; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v21, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v22, 21, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 23, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v21, v22 +; GISEL-NEXT: v_or3_b32 v2, v2, v21, v22 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 25, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 27, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 29, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 -; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v11, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB7_9: 
; %Flow3 diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index 308e86bbaf8fd..f406d1945821c 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -1595,30 +1595,29 @@ define void @freeze_v13i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: freeze_v13i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dword v18, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v0, s[8:9], 48, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[8:9], 0, v1, s[8:9] +; GFX8-GISEL-NEXT: flat_load_dword v0, v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v18, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v20, s[6:7], 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v19, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v21, vcc, 0, v3, s[6:7] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GFX8-GISEL-NEXT: 
s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dword v[2:3], v18 +; GFX8-GISEL-NEXT: flat_store_dword v[20:21], v0 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1807,30 +1806,29 @@ define void @freeze_v14i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: freeze_v14i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v0, s[8:9], 48, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[8:9], 0, v1, s[8:9] ; GFX8-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v18, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v20, s[6:7], 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 32, v2 -; 
GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v19, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v21, vcc, 0, v3, s[6:7] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-GISEL-NEXT: flat_store_dwordx2 v[20:21], v[0:1] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2029,15 +2027,15 @@ define void @freeze_v15i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 48, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx3 v[16:18], v[0:1] +; GFX8-GISEL-NEXT: flat_load_dwordx3 v[16:18], v[12:13] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GFX8-GISEL-NEXT: 
v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) @@ -2049,9 +2047,8 @@ define void @freeze_v15i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) ; GFX8-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[16:18] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2241,31 +2238,31 @@ define void @freeze_v16i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: freeze_v16i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[6:7], 48, v0 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v17, vcc, 0, v1, s[6:7] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-GISEL-NEXT: v_add_u32_e64 v20, s[4:5], 16, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v21, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: 
v_add_u32_e32 v0, vcc, 32, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v4, s[4:5], 48, v2 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v5, vcc, 0, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2466,35 +2463,37 @@ define void @freeze_v17i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: freeze_v17i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, 
vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 64, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dword v20, v[18:19] +; GFX8-GISEL-NEXT: v_add_u32_e64 v0, s[4:5], 64, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: flat_load_dword v24, v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v0, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v22, s[6:7], 48, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v23, vcc, 0, v3, s[6:7] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GFX8-GISEL-NEXT: flat_store_dword v[2:3], v20 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dword 
v[2:3], v24 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2710,35 +2709,36 @@ define void @freeze_v18i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: freeze_v18i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 64, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[18:19] +; GFX8-GISEL-NEXT: v_add_u32_e64 v0, s[4:5], 64, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v22, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v24, s[6:7], 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v23, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v25, vcc, 0, v3, s[6:7] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; 
GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[16:19] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[22:23], v[12:15] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX8-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2861,24 +2861,20 @@ define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SDAG-NEXT: s_mov_b32 s4, s6 ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 -; GFX6-SDAG-NEXT: buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72 -; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX6-SDAG-NEXT: buffer_store_dword v22, v[2:3], s[4:7], 0 addr64 offset:72 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX6-SDAG-NEXT: buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 
0 addr64 offset:32 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX6-SDAG-NEXT: buffer_load_dword v20, v[0:1], s[4:7], 0 addr64 offset:72 +; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:64 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX6-SDAG-NEXT: buffer_store_dword v20, v[2:3], s[4:7], 0 addr64 offset:72 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX6-SDAG-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:64 +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32 +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48 +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2890,20 +2886,21 @@ define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX6-GISEL-NEXT: buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72 ; GFX6-GISEL-NEXT: 
buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX6-GISEL-NEXT: buffer_load_dword v20, v[0:1], s[4:7], 0 addr64 offset:72 +; GFX6-GISEL-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:64 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64 -; GFX6-GISEL-NEXT: buffer_store_dword v22, v[2:3], s[4:7], 0 addr64 offset:72 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:64 +; GFX6-GISEL-NEXT: buffer_store_dword v20, v[2:3], s[4:7], 0 addr64 offset:72 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2914,24 +2911,20 @@ define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-SDAG-NEXT: s_mov_b32 s4, s6 ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 -; GFX7-SDAG-NEXT: buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72 -; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], 
s[4:7], 0 addr64 offset:16 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX7-SDAG-NEXT: buffer_store_dword v22, v[2:3], s[4:7], 0 addr64 offset:72 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX7-SDAG-NEXT: buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-SDAG-NEXT: buffer_load_dword v20, v[0:1], s[4:7], 0 addr64 offset:72 +; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:64 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX7-SDAG-NEXT: buffer_store_dword v20, v[2:3], s[4:7], 0 addr64 offset:72 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX7-SDAG-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:64 +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32 +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48 +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2962,37 +2955,38 @@ define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: 
freeze_v19i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GFX8-GISEL-NEXT: flat_load_dwordx3 v[20:22], v[20:21] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx3 v[20:22], v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v23, s[4:5], 32, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-GISEL-NEXT: s_nop 0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: 
v_add_u32_e32 v2, vcc, 64, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[23:24], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx3 v[6:7], v[20:22] +; GFX8-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[20:22] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3210,11 +3204,11 @@ define void @freeze_v20i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] @@ -3222,24 +3216,23 @@ define void @freeze_v20i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v24, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v26, s[6:7], 48, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v28, s[8:9], 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v25, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v27, vcc, 0, v3, s[6:7] +; 
GFX8-GISEL-NEXT: v_addc_u32_e64 v29, vcc, 0, v3, s[8:9] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[24:25], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[16:19] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[26:27], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[28:29], v[20:23] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3363,8 +3356,8 @@ define void @freeze_v21i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX6-SDAG-NEXT: buffer_load_dword v24, v[0:1], s[4:7], 0 addr64 offset:80 +; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) @@ -3372,10 +3365,10 @@ define void @freeze_v21i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) ; 
GFX6-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX6-SDAG-NEXT: buffer_store_dword v24, v[2:3], s[4:7], 0 addr64 offset:80 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16 @@ -3418,8 +3411,8 @@ define void @freeze_v21i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-SDAG-NEXT: buffer_load_dword v24, v[0:1], s[4:7], 0 addr64 offset:80 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) @@ -3427,10 +3420,10 @@ define void @freeze_v21i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX7-SDAG-NEXT: buffer_store_dword v24, v[2:3], s[4:7], 0 addr64 offset:80 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48 +; 
GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16 @@ -3467,45 +3460,46 @@ define void @freeze_v21i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: freeze_v21i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dword v26, v[8:9] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 64, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 16, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e64 v12, s[4:5], 32, v0 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[4:5], 48, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v17, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e64 v20, s[4:5], 64, v0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v22, 0x50 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v21, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e64 v0, 
s[4:5], v0, v22 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: flat_load_dword v28, v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v0, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v26, s[6:7], 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v27, vcc, 0, v3, s[6:7] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 0x50, v2 -; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x50, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[20:23] -; GFX8-GISEL-NEXT: flat_store_dword v[6:7], v26 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[26:27], v[16:19] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: 
flat_store_dwordx4 v[4:5], v[20:23] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dword v[2:3], v28 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3644,8 +3638,8 @@ define void @freeze_v22i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[24:25], v[0:1], s[4:7], 0 addr64 offset:80 +; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) @@ -3653,10 +3647,10 @@ define void @freeze_v22i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX6-SDAG-NEXT: buffer_store_dwordx2 v[24:25], v[2:3], s[4:7], 0 addr64 offset:80 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16 @@ -3699,8 +3693,8 @@ define void @freeze_v22i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64 ; GFX7-SDAG-NEXT: 
buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[24:25], v[0:1], s[4:7], 0 addr64 offset:80 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) @@ -3708,10 +3702,10 @@ define void @freeze_v22i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX7-SDAG-NEXT: buffer_store_dwordx2 v[24:25], v[2:3], s[4:7], 0 addr64 offset:80 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16 @@ -3748,45 +3742,46 @@ define void @freeze_v22i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: freeze_v22i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, 
v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dwordx2 v[24:25], v[8:9] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 64, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v26, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v27, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v20, s[4:5], 64, v0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v22, 0x50 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v21, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e64 v0, s[4:5], v0, v22 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v1, s[4:5] +; GFX8-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v26, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v28, s[6:7], 48, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v27, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v29, vcc, 0, v3, s[6:7] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 
0x50, v2 -; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[20:23] -; GFX8-GISEL-NEXT: flat_store_dwordx2 v[6:7], v[24:25] +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x50, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[26:27], v[12:15] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[28:29], v[16:19] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[20:23] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3925,8 +3920,8 @@ define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80 ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112 +; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 @@ -3936,10 +3931,10 @@ 
define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:32 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:48 @@ -3992,8 +3987,8 @@ define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80 ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 @@ -4003,10 +3998,10 @@ define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: 
buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:32 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:48 @@ -4055,60 +4050,55 @@ define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 48, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 64, v0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v48, 0x50 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v34, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v34 +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v48 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v18, 0x60 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x60 -; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v14 +; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v18 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v19, 0x70 ; GFX8-GISEL-NEXT: 
v_addc_u32_e32 v29, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x70 -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v19 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v32, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[32:33], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v32, vcc, v2, v34 -; GFX8-GISEL-NEXT: v_add_u32_e64 v34, s[4:5], 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-GISEL-NEXT: v_add_u32_e32 v36, vcc, 48, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v38, s[8:9], 64, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v39, vcc, 0, v3, s[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e64 v32, s[4:5], 16, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v34, s[6:7], 32, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v33, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v3, s[6:7] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], 
v[12:15] +; GFX8-GISEL-NEXT: s_nop 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, v2, v48 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 0x60, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[32:33], v[4:7] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[34:35], v[8:11] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(6) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[20:23] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[38:39], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[16:19] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[34:35], v[20:23] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[24:27] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[28:31] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[14:15], v[28:31] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -4279,31 +4269,25 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s4, s6 ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80 -; GFX6-SDAG-NEXT: buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120 -; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112 +; 
GFX6-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:80 +; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:64 +; GFX6-SDAG-NEXT: buffer_load_dword v32, v[0:1], s[4:7], 0 addr64 offset:120 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:112 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:64 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) -; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) -; GFX6-SDAG-NEXT: buffer_store_dword v34, v[2:3], s[4:7], 0 addr64 offset:120 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) -; GFX6-SDAG-NEXT: buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:64 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(2) +; GFX6-SDAG-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:112 +; GFX6-SDAG-NEXT: buffer_store_dword v32, v[2:3], s[4:7], 0 addr64 offset:120 +; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:80 ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:32 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[24:27], v[2:3], 
s[4:7], 0 addr64 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) ; GFX6-SDAG-NEXT: buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:16 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4316,29 +4300,30 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX6-GISEL-NEXT: buffer_load_dword v32, v[0:1], s[4:7], 0 addr64 offset:120 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80 -; GFX6-GISEL-NEXT: buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96 +; GFX6-GISEL-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:112 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) ; 
GFX6-GISEL-NEXT: buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(6) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96 -; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112 -; GFX6-GISEL-NEXT: buffer_store_dword v34, v[2:3], s[4:7], 0 addr64 offset:120 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) +; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:112 +; GFX6-GISEL-NEXT: buffer_store_dword v32, v[2:3], s[4:7], 0 addr64 offset:120 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4350,31 +4335,25 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s4, s6 ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80 -; GFX7-SDAG-NEXT: buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120 -; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:80 +; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:64 +; GFX7-SDAG-NEXT: buffer_load_dword v32, v[0:1], s[4:7], 0 addr64 offset:120 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:112 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) ; GFX7-SDAG-NEXT: 
buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:64 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) -; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) -; GFX7-SDAG-NEXT: buffer_store_dword v34, v[2:3], s[4:7], 0 addr64 offset:120 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) -; GFX7-SDAG-NEXT: buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:64 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(2) +; GFX7-SDAG-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:112 +; GFX7-SDAG-NEXT: buffer_store_dword v32, v[2:3], s[4:7], 0 addr64 offset:120 +; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:80 ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:32 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) ; GFX7-SDAG-NEXT: buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:16 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4417,60 +4396,61 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v10, 0x70 +; GFX8-GISEL-NEXT: 
v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v32, vcc, v0, v10 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_mov_b32_e32 v35, 0x50 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v35 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v18, 0x60 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x60 -; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x70 -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[28:29] -; GFX8-GISEL-NEXT: flat_load_dwordx3 v[32:34], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v35 -; GFX8-GISEL-NEXT: v_add_u32_e64 v8, 
s[4:5], 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v18 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] +; GFX8-GISEL-NEXT: flat_load_dwordx3 v[32:34], v[32:33] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v2, v35 +; GFX8-GISEL-NEXT: v_add_u32_e64 v35, s[4:5], 16, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v36, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e64 v37, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: s_mov_b64 s[10:11], vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v48, s[6:7], 48, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v50, s[8:9], 64, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v38, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v49, s[4:5], 0, v3, s[6:7] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v51, s[4:5], 0, v3, s[8:9] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v3, s[10:11] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[35:36], v[4:7] +; GFX8-GISEL-NEXT: s_nop 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 0x60, v2 +; GFX8-GISEL-NEXT: s_mov_b64 s[12:13], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 0x70, v2 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(6) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], 0, v3, s[12:13] 
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[37:38], v[8:11] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[48:49], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[50:51], v[20:23] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[24:27] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[28:31] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[28:31] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[32:34] +; GFX8-GISEL-NEXT: flat_store_dwordx3 v[6:7], v[32:34] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4769,61 +4749,59 @@ define void @freeze_v32i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v37, 0x50 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v38, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v38 +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v37 +; 
GFX8-GISEL-NEXT: v_mov_b32_e32 v22, 0x60 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x60 -; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v14 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v22 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v23, 0x70 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x70 -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v23 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v36, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v36, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v38, s[6:7], 48, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v48, s[8:9], 64, v2 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v38 -; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, 
s[4:5] -; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v2, v37 +; GFX8-GISEL-NEXT: s_mov_b64 s[10:11], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 0x60, v2 +; GFX8-GISEL-NEXT: s_mov_b64 s[12:13], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 0x70, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v39, s[4:5], 0, v3, s[6:7] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v49, s[4:5], 0, v3, s[8:9] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v3, s[10:11] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], 0, v3, s[12:13] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[8:11] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[38:39], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[48:49], v[20:23] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[24:27] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[28:31] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[28:31] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[32:35] ; 
GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -7877,31 +7855,31 @@ define void @freeze_v8f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-LABEL: freeze_v8f64: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[6:7], 48, v0 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v17, vcc, 0, v1, s[6:7] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-GISEL-NEXT: v_add_u32_e64 v20, s[4:5], 16, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v21, vcc, 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v4, s[4:5], 48, v2 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], 
v[8:11] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v5, vcc, 0, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -8574,29 +8552,28 @@ define void @freeze_v8p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: 
flat_store_dwordx4 v[4:5], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -8835,61 +8812,59 @@ define void @freeze_v16p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v37, 0x50 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v38, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v38 +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v37 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v22, 0x60 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x60 -; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v14 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v22 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v23, 0x70 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x70 -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, 
vcc, v0, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v23 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v36, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v36, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v38, s[6:7], 48, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v48, s[8:9], 64, v2 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v38 -; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; 
GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v2, v37 +; GFX8-GISEL-NEXT: s_mov_b64 s[10:11], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 0x60, v2 +; GFX8-GISEL-NEXT: s_mov_b64 s[12:13], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 0x70, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v39, s[4:5], 0, v3, s[6:7] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v49, s[4:5], 0, v3, s[8:9] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v3, s[10:11] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], 0, v3, s[12:13] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[8:11] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[38:39], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[48:49], v[20:23] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[24:27] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[28:31] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[28:31] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[32:35] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -9622,29 +9597,28 @@ define void @freeze_v8p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: 
v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -9883,61 +9857,59 @@ define void @freeze_v16p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; 
GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v37, 0x50 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v38, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v38 +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v37 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v22, 0x60 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x60 -; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v14 +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v22 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v23, 0x70 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x70 -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v23 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v36, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_add_u32_e64 v36, s[4:5], 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v38, s[6:7], 48, v2 +; GFX8-GISEL-NEXT: 
v_add_u32_e64 v48, s[8:9], 64, v2 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v38 -; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, v2, v37 +; GFX8-GISEL-NEXT: s_mov_b64 s[10:11], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 0x60, v2 +; GFX8-GISEL-NEXT: s_mov_b64 s[12:13], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 0x70, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v39, s[4:5], 0, v3, s[6:7] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v49, s[4:5], 0, v3, s[8:9] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, v3, s[10:11] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], 0, v3, s[12:13] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[8:11] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[38:39], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; 
GFX8-GISEL-NEXT: flat_store_dwordx4 v[48:49], v[20:23] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[24:27] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[28:31] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[28:31] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[32:35] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -11534,70 +11506,72 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-LABEL: freeze_v16p5: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 12, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 28, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 20, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; GFX6-SDAG-NEXT: 
v_add_i32_e32 v10, vcc, 24, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 8, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 4, v0 ; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v17, vcc, 4, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v19, vcc, 12, v1 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX6-SDAG-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX6-SDAG-NEXT: buffer_store_dword v7, v18, s[0:3], 0 offen -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX6-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen +; GFX6-SDAG-NEXT: v_add_i32_e32 v20, vcc, 16, v1 +; 
GFX6-SDAG-NEXT: v_add_i32_e32 v21, vcc, 20, v1 +; GFX6-SDAG-NEXT: v_add_i32_e32 v22, vcc, 24, v1 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) +; GFX6-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) +; GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 28, v1 +; GFX6-SDAG-NEXT: v_add_i32_e32 v17, vcc, 32, v1 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) +; GFX6-SDAG-NEXT: buffer_store_dword v14, v18, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) -; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 16, v1 -; GFX6-SDAG-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 36, v1 +; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 40, v1 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(8) +; GFX6-SDAG-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) -; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 40, v1 -; GFX6-SDAG-NEXT: v_add_i32_e32 v17, vcc, 20, v1 -; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 24, v1 -; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 28, v1 -; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v1 -; GFX6-SDAG-NEXT: v_add_i32_e32 v19, vcc, 36, v1 -; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 44, v1 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 44, v1 ; GFX6-SDAG-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX6-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v12, v20, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX6-SDAG-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX6-SDAG-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX6-SDAG-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen -; 
GFX6-SDAG-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX6-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v7, v14, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v6, v18, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v5, v13, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 48, v1 ; GFX6-SDAG-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) @@ -11617,78 +11591,77 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; GFX6-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 12, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v5, vcc, 16, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v6, vcc, 20, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v7, vcc, 24, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v15, vcc, 56, v0 +; GFX6-GISEL-NEXT: buffer_load_dword v20, v0, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GFX6-GISEL-NEXT: 
buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen -; GFX6-GISEL-NEXT: v_add_i32_e32 v14, vcc, 48, v0 ; GFX6-GISEL-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen -; GFX6-GISEL-NEXT: v_add_i32_e32 v15, vcc, 52, v0 ; GFX6-GISEL-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen -; GFX6-GISEL-NEXT: v_add_i32_e32 v16, vcc, 56, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; GFX6-GISEL-NEXT: buffer_load_dword v16, v16, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX6-GISEL-NEXT: v_add_i32_e32 v17, vcc, 4, v1 -; GFX6-GISEL-NEXT: v_add_i32_e32 v18, vcc, 8, v1 -; GFX6-GISEL-NEXT: v_add_i32_e32 v19, vcc, 12, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v16, vcc, 4, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v17, vcc, 8, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v18, vcc, 12, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v19, vcc, 16, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v21, vcc, 20, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v22, vcc, 24, v1 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) -; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 16, v1 -; GFX6-GISEL-NEXT: v_add_i32_e32 v17, vcc, 20, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v3, v18, s[0:3], 0 
offen -; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) -; GFX6-GISEL-NEXT: v_add_i32_e32 v3, vcc, 24, v1 -; GFX6-GISEL-NEXT: v_add_i32_e32 v18, vcc, 28, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 28, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v16, vcc, 32, v1 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v3, v17, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) -; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 32, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v3, vcc, 36, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v17, vcc, 40, v1 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 40, v1 -; GFX6-GISEL-NEXT: v_add_i32_e32 v19, vcc, 36, v1 +; GFX6-GISEL-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) -; GFX6-GISEL-NEXT: v_add_i32_e32 v5, vcc, 44, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 44, v1 +; GFX6-GISEL-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v6, v21, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v9, v16, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword 
v10, v3, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v11, v17, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 48, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 52, v1 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 52, v1 +; GFX6-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 56, v1 ; GFX6-GISEL-NEXT: v_add_i32_e32 v1, vcc, 60, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX6-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -11696,68 +11669,68 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG-LABEL: freeze_v16p5: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 12, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v10, 
vcc, 40, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 28, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 20, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 8, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 4, v0 ; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; GFX7-SDAG-NEXT: buffer_load_dword v0, 
v0, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, 4, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v19, vcc, 12, v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX7-SDAG-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX7-SDAG-NEXT: buffer_store_dword v7, v18, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX7-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen -; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 16, v1 -; GFX7-SDAG-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 40, v1 -; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, 20, v1 -; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 24, v1 -; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 28, v1 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v1 -; GFX7-SDAG-NEXT: v_add_i32_e32 v19, vcc, 36, v1 -; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 44, v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX7-SDAG-NEXT: v_add_i32_e32 v20, vcc, 16, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v21, vcc, 20, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v22, vcc, 24, v1 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) +; GFX7-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 28, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, 32, v1 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) +; GFX7-SDAG-NEXT: buffer_store_dword v14, v18, s[0:3], 0 offen +; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 36, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 40, v1 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(8) +; GFX7-SDAG-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen +; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 44, v1 ; GFX7-SDAG-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX7-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_store_dword v12, v20, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX7-SDAG-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen 
+; GFX7-SDAG-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX7-SDAG-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX7-SDAG-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX7-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_store_dword v7, v14, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_store_dword v6, v18, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_store_dword v5, v13, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 48, v1 ; GFX7-SDAG-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 52, v1 @@ -11775,74 +11748,74 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; GFX7-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 12, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, 16, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 20, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 24, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v11, 
vcc, 40, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 56, v0 +; GFX7-GISEL-NEXT: buffer_load_dword v20, v0, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 48, v0 ; GFX7-GISEL-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 52, v0 ; GFX7-GISEL-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 56, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; GFX7-GISEL-NEXT: buffer_load_dword v16, v16, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 4, v1 -; GFX7-GISEL-NEXT: v_add_i32_e32 v18, vcc, 8, v1 -; GFX7-GISEL-NEXT: v_add_i32_e32 v19, vcc, 12, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 4, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 8, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v18, vcc, 12, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v19, vcc, 16, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v21, vcc, 20, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v22, vcc, 24, v1 ; 
GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 16, v1 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 20, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, 24, v1 -; GFX7-GISEL-NEXT: v_add_i32_e32 v18, vcc, 28, v1 +; GFX7-GISEL-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 28, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 32, v1 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 32, v1 +; GFX7-GISEL-NEXT: buffer_store_dword v3, v17, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, 36, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 40, v1 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 40, v1 -; GFX7-GISEL-NEXT: v_add_i32_e32 v19, vcc, 36, v1 -; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, 44, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 44, v1 +; GFX7-GISEL-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v6, v21, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: 
buffer_store_dword v11, v19, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v9, v16, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v11, v17, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 48, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 52, v1 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 52, v1 +; GFX7-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 56, v1 ; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, 60, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX7-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -11852,74 +11825,74 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, 8, v0 -; GFX8-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 12, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v5, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 20, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 24, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 28, v0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 32, v0 -; 
GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 36, v0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 40, v0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v13, vcc, 44, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 36, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 40, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 44, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v13, vcc, 48, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 52, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 56, v0 +; GFX8-GISEL-NEXT: buffer_load_dword v20, v0, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 48, v0 ; GFX8-GISEL-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 52, v0 ; GFX8-GISEL-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 56, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; GFX8-GISEL-NEXT: buffer_load_dword v16, v16, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 4, v1 -; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 8, v1 -; GFX8-GISEL-NEXT: v_add_u32_e32 v19, vcc, 12, v1 +; GFX8-GISEL-NEXT: 
v_add_u32_e32 v16, vcc, 4, v1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 8, v1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 12, v1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v19, vcc, 16, v1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v21, vcc, 20, v1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v22, vcc, 24, v1 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 16, v1 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 20, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, 24, v1 -; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 28, v1 +; GFX8-GISEL-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 28, v1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 32, v1 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v1 +; GFX8-GISEL-NEXT: buffer_store_dword v3, v17, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, 36, v1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 40, v1 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 40, v1 -; GFX8-GISEL-NEXT: v_add_u32_e32 v19, vcc, 36, v1 -; GFX8-GISEL-NEXT: v_add_u32_e32 v5, vcc, 44, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 44, v1 +; GFX8-GISEL-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v6, v21, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: 
buffer_store_dword v8, v18, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v9, v16, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v11, v17, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 52, v1 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 52, v1 +; GFX8-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 56, v1 ; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, 60, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX8-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -12937,7 +12910,6 @@ define void @freeze_v16i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX6-GISEL-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX6-GISEL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX6-GISEL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX6-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-GISEL-NEXT: v_and_b32_e32 v10, 0xff, v10 ; 
GFX6-GISEL-NEXT: v_and_b32_e32 v13, 0xff, v13 @@ -12946,6 +12918,7 @@ define void @freeze_v16i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX6-GISEL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX6-GISEL-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX6-GISEL-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX6-GISEL-NEXT: v_and_b32_e32 v14, 0xff, v14 @@ -13014,7 +12987,6 @@ define void @freeze_v16i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-GISEL-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX7-GISEL-NEXT: v_and_b32_e32 v13, 0xff, v13 @@ -13023,6 +12995,7 @@ define void @freeze_v16i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX7-GISEL-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX7-GISEL-NEXT: v_and_b32_e32 v14, 0xff, v14 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 76e15eed08cc2..22d5c8166b8c5 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -3192,129 +3192,129 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GFX6-SDAG-LABEL: v_sqrt_v3f64: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: s_mov_b32 s6, 0 -; GFX6-SDAG-NEXT: s_brev_b32 s7, 8 -; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] -; GFX6-SDAG-NEXT: v_mov_b32_e32 v10, 0x100 -; GFX6-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3] -; 
GFX6-SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5] -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: s_brev_b32 s5, 8 +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX6-SDAG-NEXT: v_mov_b32_e32 v14, 0x100 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v15, 0xffffff80 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v16, 0x260 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v14, vcc ; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] -; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[6:7] -; GFX6-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] -; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] -; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] -; GFX6-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; GFX6-SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[6:7] ; GFX6-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; GFX6-SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9] -; GFX6-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[10:11], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[8:9], v[14:15], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] -; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] -; GFX6-SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[12:13] -; GFX6-SDAG-NEXT: v_mul_f64 v[12:13], v[12:13], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[18:19], v[8:9] -; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[16:17], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] -; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] -; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] -; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[14:15], v[14:15], v[2:3] -; 
GFX6-SDAG-NEXT: v_fma_f64 v[14:15], v[18:19], v[8:9], v[14:15] -; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5] -; GFX6-SDAG-NEXT: v_fma_f64 v[16:17], v[18:19], v[12:13], v[16:17] -; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] -; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[10:11] -; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], -v[14:15], v[14:15], v[2:3] -; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5] -; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[14:15] -; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[12:13], v[16:17] -; GFX6-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; GFX6-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5] -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7] -; GFX6-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12 -; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; GFX6-SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13 -; GFX6-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15 -; GFX6-SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 -; GFX6-SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15 +; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[6:7], v[8:9] +; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[0:1] +; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v15, vcc +; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v16 +; GFX6-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; 
GFX6-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v14, vcc +; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; GFX6-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] +; GFX6-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[6:7], v[8:9] +; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v15, vcc +; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v16 +; GFX6-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v14, vcc +; GFX6-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[4:5] +; GFX6-SDAG-NEXT: v_mul_f64 v[8:9], v[4:5], v[6:7] +; GFX6-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[4:5] +; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[6:7], v[8:9] +; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[4:5] +; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v15, vcc +; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[4:5], v16 +; GFX6-SDAG-NEXT: v_ldexp_f64 
v[6:7], v[6:7], v8 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_sqrt_v3f64: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: s_mov_b32 s6, 0 -; GFX8-SDAG-NEXT: s_brev_b32 s7, 8 -; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] -; GFX8-SDAG-NEXT: v_mov_b32_e32 v10, 0x100 -; GFX8-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3] -; GFX8-SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc +; GFX8-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX8-SDAG-NEXT: s_brev_b32 s5, 8 +; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX8-SDAG-NEXT: v_mov_b32_e32 v14, 0x100 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v15, 0xffffff80 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v16, 0x260 +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v14, vcc ; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] -; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[6:7] -; GFX8-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] -; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] -; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] -; GFX8-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] +; GFX8-SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[6:7] ; GFX8-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; GFX8-SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9] -; GFX8-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[10:11], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[8:9], v[14:15], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] -; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] -; GFX8-SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_mul_f64 v[12:13], v[12:13], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 
v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[18:19], v[8:9] -; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[16:17], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] -; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] -; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] -; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[14:15], v[14:15], v[2:3] -; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], v[18:19], v[8:9], v[14:15] -; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5] -; GFX8-SDAG-NEXT: v_fma_f64 v[16:17], v[18:19], v[12:13], v[16:17] -; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] -; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[10:11] -; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], -v[14:15], v[14:15], v[2:3] -; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5] -; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[14:15] -; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[12:13], v[16:17] -; GFX8-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; GFX8-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7] -; GFX8-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12 -; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; GFX8-SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13 -; GFX8-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15 -; GFX8-SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 -; GFX8-SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15 +; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[6:7], v[8:9] 
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[0:1] +; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v15, vcc +; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v16 +; GFX8-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v14, vcc +; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; GFX8-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] +; GFX8-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[6:7], v[8:9] +; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v15, vcc +; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v16 +; GFX8-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v14, vcc +; GFX8-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[4:5] +; GFX8-SDAG-NEXT: v_mul_f64 v[8:9], v[4:5], v[6:7] +; GFX8-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 +; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; 
GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[4:5] +; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[6:7], v[8:9] +; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[4:5] +; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v15, vcc +; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[4:5], v16 +; GFX8-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-GISEL-LABEL: v_sqrt_v3f64: @@ -3323,66 +3323,66 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX6-GISEL-NEXT: s_brev_b32 s5, 8 ; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; GFX6-GISEL-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-GISEL-NEXT: v_mov_b32_e32 v7, s5 -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 +; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] -; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] -; GFX6-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5 +; GFX6-GISEL-NEXT: 
v_mul_f64 v[10:11], v[8:9], 0.5 ; GFX6-GISEL-NEXT: v_mul_f64 v[8:9], v[0:1], v[8:9] -; GFX6-GISEL-NEXT: v_mul_f64 v[14:15], v[10:11], 0.5 -; GFX6-GISEL-NEXT: v_mul_f64 v[10:11], v[2:3], v[10:11] -; GFX6-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[8:9], 0.5 -; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[14:15], v[10:11], 0.5 -; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9] -; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] -; GFX6-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], 0.5 -; GFX6-GISEL-NEXT: v_mul_f64 v[12:13], v[4:5], v[12:13] -; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[16:17], v[12:13], 0.5 -; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX6-GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] -; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1] -; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[18:19], v[6:7], v[8:9] -; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[2:3] -; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[14:15], v[10:11] -; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5] -; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], v[18:19], v[16:17], v[12:13] -; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1] -; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[8:9] -; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[10:11], v[2:3] -; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5] -; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[10:11] -; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[16:17], v[12:13] -; GFX6-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7] -; GFX6-GISEL-NEXT: v_ldexp_f64 
v[6:7], v[6:7], v12 +; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], 0.5 +; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[12:13], v[10:11], v[8:9] +; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[12:13], v[10:11], v[8:9] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc ; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13 -; GFX6-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 -; GFX6-GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX6-GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v10 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] +; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 +; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] +; GFX6-GISEL-NEXT: v_mul_f64 v[10:11], v[8:9], 0.5 +; GFX6-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[8:9] +; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], 0.5 +; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[12:13], v[10:11], v[8:9] +; 
GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[12:13], v[10:11], v[8:9] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc +; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; GFX6-GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v10 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[6:7] +; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX6-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[4:5] +; GFX6-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; GFX6-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], v[6:7] +; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[4:5] +; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[8:9], v[6:7] +; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[4:5] +; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[8:9], v[6:7] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[4:5], v15 +; GFX6-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_sqrt_v3f64: @@ -3391,66 +3391,66 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GFX8-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX8-GISEL-NEXT: s_brev_b32 s5, 8 ; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; 
GFX8-GISEL-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 +; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX8-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] -; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] -; GFX8-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5 +; GFX8-GISEL-NEXT: v_mul_f64 v[10:11], v[8:9], 0.5 ; GFX8-GISEL-NEXT: v_mul_f64 v[8:9], v[0:1], v[8:9] -; GFX8-GISEL-NEXT: v_mul_f64 v[14:15], v[10:11], 0.5 -; GFX8-GISEL-NEXT: v_mul_f64 v[10:11], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[8:9], 0.5 -; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[14:15], v[10:11], 0.5 -; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9] -; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] -; GFX8-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], 0.5 -; GFX8-GISEL-NEXT: v_mul_f64 v[12:13], v[4:5], v[12:13] -; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[16:17], v[12:13], 0.5 -; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX8-GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] -; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1] -; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[18:19], v[6:7], v[8:9] -; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[2:3] -; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[14:15], v[10:11] -; 
GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5] -; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], v[18:19], v[16:17], v[12:13] -; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1] -; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[8:9] -; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[10:11], v[2:3] -; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5] -; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[10:11] -; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[16:17], v[12:13] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7] -; GFX8-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12 +; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], 0.5 +; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[12:13], v[10:11], v[8:9] +; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1] +; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[12:13], v[10:11], v[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc ; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; GFX8-GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13 -; GFX8-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15 -; GFX8-GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 -; GFX8-GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15 -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX8-GISEL-NEXT: 
v_ldexp_f64 v[8:9], v[8:9], v10 +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 +; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] +; GFX8-GISEL-NEXT: v_mul_f64 v[10:11], v[8:9], 0.5 +; GFX8-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[8:9] +; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], 0.5 +; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[12:13], v[10:11], v[8:9] +; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[12:13], v[10:11], v[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc +; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; GFX8-GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v10 +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX8-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[4:5] +; GFX8-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; GFX8-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], v[6:7] +; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[4:5] +; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[8:9], v[6:7] +; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[4:5] +; GFX8-GISEL-NEXT: v_fma_f64 
v[6:7], v[10:11], v[8:9], v[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[4:5], v15 +; GFX8-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x) ret <3 x double> %result diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 3c41cc43a089e..5b18c74dcd530 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -663,70 +663,25 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 { ; 1 over register limit define void @void_func_v33i32(<33 x i32> %arg0) #0 { -; CI-LABEL: void_func_v33i32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v33i32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, 
s[4:7], 0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v33i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v33i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], 
off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(7) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dword v32, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v33i32: ; GFX11: ; %bb.0: @@ -1508,72 +1463,71 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; CI-LABEL: void_func_v32i8: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; CI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; CI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; CI-NEXT: v_or_b32_e32 v8, v8, v9 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; CI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; CI-NEXT: v_or_b32_e32 v8, v8, v9 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; CI-NEXT: v_and_b32_e32 v9, 0xff, v14 -; CI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; CI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; CI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; CI-NEXT: v_or_b32_e32 v12, v12, v13 -; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; CI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; CI-NEXT: v_or_b32_e32 v4, v4, v5 +; CI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; CI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; CI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; CI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xff, 
v0 +; CI-NEXT: v_lshlrev_b32_e32 v14, 8, v29 +; CI-NEXT: v_and_b32_e32 v15, 0xff, v28 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; CI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; CI-NEXT: v_and_b32_e32 v28, 0xff, v30 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: v_or_b32_e32 v1, v15, v14 +; CI-NEXT: v_or_b32_e32 v6, v7, v6 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; CI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v29 -; CI-NEXT: v_and_b32_e32 v14, 0xff, v28 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; CI-NEXT: v_or_b32_e32 v10, v11, v10 +; CI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; CI-NEXT: v_and_b32_e32 v12, 0xffff, v1 +; CI-NEXT: v_or_b32_e32 v1, v4, v6 ; CI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; CI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; CI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v1, v1, v9 -; CI-NEXT: v_or_b32_e32 v9, v11, v10 -; CI-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; CI-NEXT: v_or_b32_e32 v6, v7, v6 -; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; CI-NEXT: v_lshlrev_b32_e32 v15, 24, v27 -; CI-NEXT: v_and_b32_e32 v27, 0xff, v30 -; CI-NEXT: v_or_b32_e32 v13, v14, v13 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v7, v3, v2 -; CI-NEXT: v_or_b32_e32 v3, v10, v1 -; CI-NEXT: v_or_b32_e32 v1, v4, v6 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; CI-NEXT: v_or_b32_e32 v11, v15, v14 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; CI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; CI-NEXT: v_or_b32_e32 v5, v5, v13 ; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; CI-NEXT: v_or_b32_e32 v0, v0, v7 -; CI-NEXT: v_or_b32_e32 v2, v8, v9 -; CI-NEXT: v_and_b32_e32 v8, 0xff, 
v20 -; CI-NEXT: v_and_b32_e32 v9, 0xff, v16 +; CI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; CI-NEXT: v_or_b32_e32 v13, v27, v15 +; CI-NEXT: v_or_b32_e32 v3, v11, v5 +; CI-NEXT: v_or_b32_e32 v2, v8, v10 +; CI-NEXT: v_lshlrev_b32_e32 v5, 24, v23 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 ; CI-NEXT: s_mov_b64 s[4:5], 16 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; CI-NEXT: v_or_b32_e32 v5, v24, v25 -; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; CI-NEXT: v_or_b32_e32 v4, v4, v26 -; CI-NEXT: v_or_b32_e32 v6, v5, v11 -; CI-NEXT: v_and_b32_e32 v5, 0xff, v22 +; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; CI-NEXT: v_or_b32_e32 v4, v4, v14 ; CI-NEXT: v_or_b32_e32 v7, v12, v4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_or_b32_e32 v4, v4, v5 +; CI-NEXT: v_or_b32_e32 v4, v24, v25 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_or_b32_e32 v6, v4, v13 +; CI-NEXT: v_or_b32_e32 v4, v5, v8 ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; CI-NEXT: v_and_b32_e32 v8, 0xff, v20 ; CI-NEXT: v_or_b32_e32 v5, v8, v5 ; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; CI-NEXT: v_and_b32_e32 v8, 0xff, v18 @@ -1582,6 +1536,7 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; CI-NEXT: v_or_b32_e32 v4, v4, v8 ; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; CI-NEXT: v_and_b32_e32 v9, 0xff, v16 ; CI-NEXT: v_or_b32_e32 v8, v9, v8 ; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; CI-NEXT: v_or_b32_e32 v4, v8, v4 @@ -1594,22 +1549,22 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; GFX89-LABEL: void_func_v32i8: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; GFX89-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v11 -; GFX89-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 ; GFX89-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX89-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_lshlrev_b16_e32 v13, 8, v15 +; GFX89-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 +; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX89-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v11 ; GFX89-NEXT: v_lshlrev_b16_e32 v5, 8, v5 ; GFX89-NEXT: v_lshlrev_b16_e32 v7, 8, v7 ; GFX89-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX89-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX89-NEXT: v_lshlrev_b16_e32 v11, 8, v29 -; GFX89-NEXT: v_lshlrev_b16_e32 v14, 8, v25 -; GFX89-NEXT: v_lshlrev_b16_e32 v15, 8, v27 +; GFX89-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; GFX89-NEXT: v_lshlrev_b16_e32 v25, 8, v27 ; GFX89-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX89-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; GFX89-NEXT: v_lshlrev_b16_e32 v17, 8, v17 @@ -1620,8 +1575,8 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; GFX89-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v11, v24, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v14, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v10, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v11, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v20, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1631,12 +1586,12 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v6, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v6, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v5, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_lshlrev_b16_e32 v8, 8, v10 +; GFX89-NEXT: v_lshlrev_b16_e32 v8, 8, v14 ; GFX89-NEXT: v_or_b32_sdwa v8, v30, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 @@ -2967,10 +2922,10 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 
v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_mul_f32_e32 v16, 1.0, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v33 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_mul_f32_e32 v12, 1.0, v32 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v33 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 @@ -2978,14 +2933,14 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 1, v34 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 ; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3228,42 +3183,42 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 
v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v38 +; CI-NEXT: v_mul_f32_e32 v8, 1.0, v32 +; CI-NEXT: v_mul_f32_e32 v9, 1.0, v33 +; CI-NEXT: v_mul_f32_e32 v10, 1.0, v34 +; CI-NEXT: v_mul_f32_e32 v11, 1.0, v35 +; CI-NEXT: v_mul_f32_e32 v12, 1.0, v36 +; CI-NEXT: v_mul_f32_e32 v13, 1.0, v37 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; CI-NEXT: v_mul_f32_e32 v4, 1.0, v32 -; CI-NEXT: v_mul_f32_e32 v5, 1.0, v33 -; CI-NEXT: v_mul_f32_e32 v6, 1.0, v34 -; CI-NEXT: v_mul_f32_e32 v7, 1.0, v35 -; CI-NEXT: v_mul_f32_e32 v8, 1.0, v36 -; CI-NEXT: v_mul_f32_e32 v9, 1.0, v37 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v25, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v20, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 -; CI-NEXT: buffer_store_short v11, off, s[4:7], 0 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; CI-NEXT: buffer_store_short v15, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v10, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v14, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v5, 
off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3386,9 +3341,9 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 @@ -3423,9 +3378,9 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 @@ -3460,10 +3415,10 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 @@ -3619,9 +3574,9 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 
offset:12 ; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3631,30 +3586,30 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; CI-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[52:55], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[48:51], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3668,9 +3623,9 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3680,30 +3635,30 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 +; VI-NEXT: 
buffer_load_dword v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3717,9 +3672,9 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: 
buffer_load_dword v34, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3729,31 +3684,31 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; GFX9-NEXT: 
buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[52:55], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[48:51], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3821,16 +3776,19 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v37, 
off, s[0:3], s32 offset:40 -; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 +; CI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 +; CI-NEXT: s_waitcnt vmcnt(10) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3841,52 +3799,57 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; 
CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 +; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 +; CI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dwordx4 v[31:34], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 -; 
CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 -; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[39:42], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[52:55], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[48:51], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[35:38], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3894,16 +3857,19 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3914,52 +3880,57 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; VI-NEXT: 
buffer_load_dword v36, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[4:7], 0 ; 
VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3967,16 +3938,20 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3987,55 +3962,57 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:40 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: buffer_store_dwordx4 v[31:34], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 -; GFX9-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[39:42], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[52:55], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[48:51], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[35:38], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4323,10 +4300,10 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; 
CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 ; CI-NEXT: s_waitcnt vmcnt(7) @@ -4334,50 +4311,52 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 ; CI-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v34, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v34, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v24, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v21, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v25, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v23, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v22, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4385,8 +4364,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) 
-; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: void_func_v32i32_v16i8: @@ -4396,10 +4373,10 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 ; VI-NEXT: s_waitcnt vmcnt(7) @@ -4407,50 +4384,52 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v24, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ubyte v25, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ubyte v21, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ubyte v22, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ubyte v23, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_store_dwordx4 
v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v34, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v33, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v35, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v34, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v24, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v21, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; 
VI-NEXT: buffer_store_byte v25, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v23, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v22, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4458,8 +4437,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: void_func_v32i32_v16i8: @@ -4469,10 +4446,10 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(7) @@ -4480,51 +4457,55 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v24, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ubyte v25, 
off, s[0:3], s32 offset:24 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ubyte v21, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ubyte v22, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ubyte v23, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v34, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v33, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
buffer_store_byte v35, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v34, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v24, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v21, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v25, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v23, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v22, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4532,8 +4513,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: void_func_v32i32_v16i8: diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 38003f6075c35..170dff1bf1ab5 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1110,10 +1110,10 @@ define <16 x i16> @v16i16_func_void() #0 { 
; CI-NEXT: v_mov_b32_e32 v0, v22 ; CI-NEXT: v_mov_b32_e32 v2, v23 ; CI-NEXT: v_mov_b32_e32 v4, v24 -; CI-NEXT: v_mov_b32_e32 v6, v25 ; CI-NEXT: v_mov_b32_e32 v8, v18 ; CI-NEXT: v_mov_b32_e32 v10, v19 ; CI-NEXT: v_mov_b32_e32 v12, v20 +; CI-NEXT: v_mov_b32_e32 v6, v25 ; CI-NEXT: v_mov_b32_e32 v14, v21 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1316,7 +1316,6 @@ define <33 x i32> @v33i32_func_void() #0 { ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_add_i32_e32 v34, vcc, 0x80, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 @@ -1325,58 +1324,55 @@ define <33 x i32> @v33i32_func_void() #0 { ; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 ; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v33, vcc, 0x7c, v0 -; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v4, vcc, 0x78, v0 ; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; CI-NEXT: v_add_i32_e32 v33, vcc, 0x64, v0 +; CI-NEXT: v_add_i32_e32 v34, vcc, 0x60, v0 ; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: buffer_store_dword v8, v1, 
s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v6, v33, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; CI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 -; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; CI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; CI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v33, vcc, 64, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; CI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 ; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; CI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; 
CI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v33, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen ; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v12, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 ; CI-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 @@ -1388,7 +1384,6 @@ define <33 x i32> @v33i32_func_void() #0 { ; CI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; CI-NEXT: s_waitcnt vmcnt(14) ; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen @@ -1406,7 +1401,6 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x80, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 @@ -1415,58 +1409,55 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 ; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 -; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x7c, v0 -; GFX8-NEXT: s_waitcnt vmcnt(8) ; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x78, v0 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x6c, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x68, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x64, v0 +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x60, v0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x64, v0 -; GFX8-NEXT: s_waitcnt vmcnt(11) -; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v8, 
v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v6, v33, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x50, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x48, v0 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x44, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 60, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x4c, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x44, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 48, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 44, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 40, v0 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 36, v0 ; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 48, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 44, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 40, v0 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 36, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GFX8-NEXT: 
buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v33, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v12, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 ; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 @@ -1478,7 +1469,6 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) ; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 8, v0 ; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen @@ -1499,20 +1489,20 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX9-NEXT: 
buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 ; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:92 @@ -1523,25 +1513,22 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v24, 
v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(26) ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:24 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 @@ -1599,7 +1586,6 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_add_i32_e32 v34, vcc, 0x80, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 @@ -1608,58 +1594,55 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 ; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v33, vcc, 0x7c, v0 -; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v4, vcc, 0x78, v0 ; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; CI-NEXT: 
v_add_i32_e32 v2, vcc, 0x70, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; CI-NEXT: v_add_i32_e32 v33, vcc, 0x64, v0 +; CI-NEXT: v_add_i32_e32 v34, vcc, 0x60, v0 ; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v6, v33, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; CI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 -; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; CI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; CI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v33, vcc, 64, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v11, v2, s[0:3], 
0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; CI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 ; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; CI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; CI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v33, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen ; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v12, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 ; CI-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 @@ -1671,7 +1654,6 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; CI-NEXT: 
v_add_i32_e32 v1, vcc, 16, v0 ; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; CI-NEXT: s_waitcnt vmcnt(14) ; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen @@ -1689,7 +1671,6 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x80, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 @@ -1698,58 +1679,55 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 ; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 -; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x7c, v0 -; GFX8-NEXT: s_waitcnt vmcnt(8) ; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x78, v0 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x6c, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x68, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x64, v0 +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x60, v0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x64, v0 -; 
GFX8-NEXT: s_waitcnt vmcnt(11) -; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v6, v33, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x50, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x48, v0 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x44, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 60, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x4c, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x44, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 48, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 44, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 40, v0 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 36, v0 ; GFX8-NEXT: buffer_store_dword v9, v4, 
s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 48, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 44, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 40, v0 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 36, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v33, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v12, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 ; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 @@ -1761,7 +1739,6 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) ; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 8, v0 
; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen @@ -1782,20 +1759,20 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 ; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:92 @@ -1806,25 +1783,22 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; 
GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(26) ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:24 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 @@ -1880,8 +1854,8 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_add_i32_e32 v34, vcc, 0xe0, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 ; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 ; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 @@ -1890,60 +1864,56 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 
offset:160 ; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 ; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 -; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v33, vcc, 0xfc, v0 -; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v4, vcc, 0xf8, v0 ; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v3, vcc, 0xf4, v0 ; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xec, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xe8, v0 +; CI-NEXT: v_add_i32_e32 v33, vcc, 0xe4, v0 ; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0xec, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 0xe4, v0 -; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0xe0, v0 -; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0 -; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v3, vcc, 0xd8, v0 -; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0xd4, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xdc, v0 +; CI-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xd8, v0 +; CI-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xd4, v0 +; CI-NEXT: buffer_store_dword v6, v33, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v4, vcc, 0xd0, v0 -; CI-NEXT: v_add_i32_e32 v5, vcc, 0xcc, v0 -; CI-NEXT: v_add_i32_e32 v6, vcc, 0xc8, v0 -; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen 
-; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 0xb8, v0 -; CI-NEXT: v_add_i32_e32 v7, vcc, 0xc4, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 0xbc, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0xcc, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0xc8, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 0xc4, v0 +; CI-NEXT: v_add_i32_e32 v33, vcc, 0xc0, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xbc, v0 +; CI-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0 +; CI-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xb0, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 0xac, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 0xa8, v0 +; CI-NEXT: v_add_i32_e32 v12, vcc, 0xa4, v0 +; CI-NEXT: v_add_i32_e32 v34, vcc, 0xa0, v0 ; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v4, vcc, 0xb4, v0 -; CI-NEXT: v_add_i32_e32 v8, vcc, 0xb0, v0 -; CI-NEXT: v_add_i32_e32 v9, vcc, 0xac, v0 -; CI-NEXT: v_add_i32_e32 v10, vcc, 0xa8, v0 -; CI-NEXT: v_add_i32_e32 v11, vcc, 0xa4, v0 -; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v5, vcc, 0xa0, v0 -; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v33, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen ; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword 
v17, v8, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v12, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v21, v34, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 ; CI-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 0x98, v0 @@ -1953,7 +1923,6 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; CI-NEXT: v_add_i32_e32 v1, vcc, 0x90, v0 ; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0 -; CI-NEXT: s_waitcnt vmcnt(14) ; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0 ; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen @@ -1970,8 +1939,8 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0xe0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 ; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 ; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 @@ -1980,60 +1949,56 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 ; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 ; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 -; GFX8-NEXT: s_waitcnt 
vmcnt(8) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0xfc, v0 -; GFX8-NEXT: s_waitcnt vmcnt(8) ; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf8, v0 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xf4, v0 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xf0, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0xe4, v0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xec, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xe8, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xe4, v0 -; GFX8-NEXT: s_waitcnt vmcnt(11) -; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0 -; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xdc, v0 -; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd8, v0 -; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xd4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0 +; GFX8-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd8, v0 +; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0 +; GFX8-NEXT: buffer_store_dword v6, v33, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd0, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xcc, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xc8, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: 
buffer_store_dword v5, v34, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xb8, v0 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xc4, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc0, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xbc, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xcc, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xc8, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xc4, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0xc0, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xbc, v0 +; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xb4, v0 +; GFX8-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xb0, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xac, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xa8, v0 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xa4, v0 +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0xa0, v0 ; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xb4, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xb0, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xac, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xa8, v0 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xa4, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xa0, v0 -; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v33, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX8-NEXT: 
buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v12, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v21, v34, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x9c, v0 ; GFX8-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x98, v0 @@ -2043,7 +2008,6 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x90, v0 ; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x8c, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) ; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x88, v0 ; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen @@ -2065,20 +2029,20 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 ; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:252 ; GFX9-NEXT: 
buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:236 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:240 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:224 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 @@ -2089,25 +2053,22 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:204 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:192 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:188 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:192 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:172 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:164 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:160 -; GFX9-NEXT: s_waitcnt vmcnt(26) ; GFX9-NEXT: 
buffer_store_dword v28, v0, s[0:3], 0 offen offset:156 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:148 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:144 -; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:140 ; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136 ; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:132 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index f67ab18dd8ef1..5d5ca6812a530 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -5698,11 +5698,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 16 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: v_mov_b32_e32 v42, 16 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[40:41], off -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[42:43], off +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: global_load_dwordx4 v[32:35], v[42:43], off ; GFX9-NEXT: v_writelane_b32 v44, s34, 2 ; GFX9-NEXT: v_writelane_b32 v44, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi @@ -5710,13 +5710,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v16 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 @@ -5726,27 +5722,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v35 ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_mov_b32_e32 v16, v32 ; GFX9-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-NEXT: v_mov_b32_e32 v20, v17 -; GFX9-NEXT: v_mov_b32_e32 v24, v18 -; GFX9-NEXT: v_mov_b32_e32 v28, v19 -; GFX9-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-NEXT: v_mov_b32_e32 v2, v36 -; GFX9-NEXT: v_mov_b32_e32 v3, v37 -; GFX9-NEXT: v_mov_b32_e32 v17, v32 -; GFX9-NEXT: v_mov_b32_e32 v18, v33 -; GFX9-NEXT: v_mov_b32_e32 v19, v34 +; 
GFX9-NEXT: v_mov_b32_e32 v1, v20 +; GFX9-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-NEXT: v_mov_b32_e32 v3, v24 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v24, v34 +; GFX9-NEXT: v_mov_b32_e32 v28, v35 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 @@ -5756,25 +5754,25 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v27 ; GFX9-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 -; GFX9-NEXT: v_or_b32_sdwa v4, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v27 ; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
GFX9-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index b750d28ffa7d3..954b69743fe18 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2394,124 +2394,128 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-LABEL: return_72xi32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 +; GFX10-NEXT: s_clause 0xd +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:120 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen 
offset:116 +; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:140 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:112 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:108 -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:136 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:144 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:148 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:88 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:84 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:80 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:76 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 
offset:100 -; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:68 +; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:64 +; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 ; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:52 ; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:52 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v16, off, 
s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 -; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: s_clause 0x8 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:284 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:280 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:276 +; GFX10-NEXT: s_waitcnt 
vmcnt(20) +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:272 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:268 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:264 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:260 +; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:256 +; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:252 +; GFX10-NEXT: s_waitcnt vmcnt(16) +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:236 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: s_waitcnt vmcnt(12) +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:216 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:212 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:204 +; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:184 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:180 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:176 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:172 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen 
offset:168 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:164 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:160 +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:128 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:284 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:280 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:276 -; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:272 -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:268 -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:264 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:260 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:256 -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:252 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:240 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: buffer_store_dword 
v19, v0, s[0:3], 0 offen offset:220 -; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:216 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:192 -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:188 -; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:184 -; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:180 -; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:176 -; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:172 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:168 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:148 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:136 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 -; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:124 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3187,7 +3191,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: 
s_and_b32 s33, s33, 0xfffffe00 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:1600 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:1616 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3199,15 +3203,14 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s36, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:16 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:16 ; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:12 ; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:8 ; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:4 @@ -3232,7 +3235,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: s_add_i32 s2, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v60, s30, 0 +; GFX11-NEXT: v_writelane_b32 v47, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0 
@@ -3253,16 +3256,17 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 ; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo -; GFX11-NEXT: v_writelane_b32 v60, s31, 1 +; GFX11-NEXT: v_writelane_b32 v47, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:624 +; GFX11-NEXT: s_add_i32 s2, s32, 0xa0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s33 offset:1536 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 ; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 -; GFX11-NEXT: s_add_i32 s2, s32, 0xa0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_mov_b32_e32 v32, v48 -; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 +; GFX11-NEXT: v_mov_b32_e32 v32, v3 +; GFX11-NEXT: s_clause 0x8 ; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 ; GFX11-NEXT: scratch_load_b128 v[37:40], off, s33 offset:688 ; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:704 @@ -3272,105 +3276,105 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 ; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 ; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_mov_b32_e32 v46, v13 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: v_mov_b32_e32 v45, v12 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1584 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1600 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:528 ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 
offset:544 ; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10 +; GFX11-NEXT: v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX11-NEXT: v_dual_mov_b32 v6, v9 :: v_dual_mov_b32 v7, v10 ; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v9, v20 ; GFX11-NEXT: v_mov_b32_e32 v10, v21 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1584 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v36 ; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 -; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v48, v51 -; GFX11-NEXT: v_dual_mov_b32 v49, v52 :: v_dual_mov_b32 v50, v53 -; GFX11-NEXT: v_dual_mov_b32 v51, v54 :: v_dual_mov_b32 v36, v55 -; GFX11-NEXT: v_dual_mov_b32 v53, v41 :: v_dual_mov_b32 v52, v40 -; GFX11-NEXT: v_dual_mov_b32 v54, v42 :: v_dual_mov_b32 v41, v56 -; GFX11-NEXT: v_dual_mov_b32 v55, v43 :: v_dual_mov_b32 v40, v44 -; GFX11-NEXT: v_dual_mov_b32 v42, v57 :: 
v_dual_mov_b32 v57, v12 -; GFX11-NEXT: v_dual_mov_b32 v43, v58 :: v_dual_mov_b32 v56, v59 -; GFX11-NEXT: v_mov_b32_e32 v58, v13 -; GFX11-NEXT: v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v13, v0 -; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 -; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9 -; GFX11-NEXT: v_mov_b32_e32 v9, v20 +; GFX11-NEXT: v_dual_mov_b32 v49, v52 :: v_dual_mov_b32 v48, v51 +; GFX11-NEXT: v_dual_mov_b32 v51, v54 :: v_dual_mov_b32 v52, v40 +; GFX11-NEXT: v_mov_b32_e32 v54, v42 +; GFX11-NEXT: v_mov_b32_e32 v40, v44 +; GFX11-NEXT: v_mov_b32_e32 v42, v57 +; GFX11-NEXT: v_mov_b32_e32 v44, v59 ; GFX11-NEXT: scratch_store_b32 off, v11, s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x90 -; GFX11-NEXT: v_mov_b32_e32 v11, v22 +; GFX11-NEXT: v_dual_mov_b32 v32, v36 :: v_dual_mov_b32 v35, v50 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x80 ; GFX11-NEXT: v_mov_b32_e32 v5, v16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 -; GFX11-NEXT: v_mov_b32_e32 v0, 24 +; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v11, v22 ; GFX11-NEXT: s_add_i32 s2, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v6, v17 +; GFX11-NEXT: v_mov_b32_e32 v50, v53 ; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x6c -; GFX11-NEXT: v_mov_b32_e32 v7, v18 +; GFX11-NEXT: v_dual_mov_b32 v36, v55 :: v_dual_mov_b32 v53, v41 +; GFX11-NEXT: v_mov_b32_e32 v55, v43 +; GFX11-NEXT: v_mov_b32_e32 v41, v56 +; GFX11-NEXT: v_mov_b32_e32 v43, v58 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x60 -; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 -; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2 +; GFX11-NEXT: v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v13, v24 +; GFX11-NEXT: scratch_store_b96 off, v[44:46], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x50 -; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 +; 
GFX11-NEXT: v_mov_b32_e32 v7, v18 ; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 64 -; GFX11-NEXT: v_mov_b32_e32 v13, v24 +; GFX11-NEXT: v_mov_b32_e32 v12, v23 ; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 48 ; GFX11-NEXT: v_mov_b32_e32 v14, v25 ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 32 -; GFX11-NEXT: v_mov_b32_e32 v16, v27 +; GFX11-NEXT: v_mov_b32_e32 v15, v26 ; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 16 -; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: v_mov_b32_e32 v16, v27 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 -; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 -; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 -; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1600 +; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1584 +; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1568 +; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1552 +; GFX11-NEXT: scratch_load_b128 v[29:32], off, s33 offset:1536 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xa ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 ; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v47, off, s33 
offset:16 -; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:20 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:24 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:28 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:32 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:36 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:40 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:44 -; GFX11-NEXT: v_readlane_b32 s31, v60, 1 -; GFX11-NEXT: v_readlane_b32 s30, v60, 0 +; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:24 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:28 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:32 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:36 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:40 +; GFX11-NEXT: v_readlane_b32 s31, v47, 1 +; GFX11-NEXT: v_readlane_b32 s30, v47, 0 ; GFX11-NEXT: s_mov_b32 s32, s34 ; GFX11-NEXT: s_mov_b32 s34, s36 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1600 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:1616 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s33, s35 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index e532deaca98a8..cfab0b1dee489 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -246,71 +246,69 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0x7 -; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:112 -; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:96 -; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:80 -; 
GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:48 -; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:32 -; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 -; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x70 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 +; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:16 +; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off +; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:48 +; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:32 +; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:80 +; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64 +; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112 +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:96 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[36:37], 16 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 48 ; GCN-SDAG-NEXT: v_dual_mov_b32 v34, 0xc8 :: v_dual_mov_b32 v35, 0 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x7 -; GCN-SDAG-NEXT: global_store_b128 v[36:37], v[6:9], off +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[14:15], v[14:15] ; GCN-SDAG-NEXT: s_wait_loadcnt 0x6 -; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off +; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[18:21], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x5 -; GCN-SDAG-NEXT: s_wait_xcnt 0x1 -; GCN-SDAG-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v37, v17 +; GCN-SDAG-NEXT: global_store_b128 v[64:65], v[22:25], off ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], 
v[12:13], v[12:13] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11] -; GCN-SDAG-NEXT: s_wait_loadcnt 0x4 -; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[18:21], off -; GCN-SDAG-NEXT: s_wait_loadcnt 0x3 -; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[22:25], off -; GCN-SDAG-NEXT: s_wait_loadcnt 0x2 -; GCN-SDAG-NEXT: global_store_b128 v[64:65], v[26:29], off -; GCN-SDAG-NEXT: s_wait_loadcnt 0x1 -; GCN-SDAG-NEXT: global_store_b128 v[66:67], v[30:33], off +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[64:65], v[22:23], v[22:23] +; GCN-SDAG-NEXT: v_mov_b64_e32 v[22:23], 0x60 +; GCN-SDAG-NEXT: global_store_b128 v[36:37], v[14:17], off ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7] +; GCN-SDAG-NEXT: v_mov_b64_e32 v[14:15], 0x70 +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[54:55], v[20:21], v[20:21] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[52:53], v[18:19], v[18:19] +; GCN-SDAG-NEXT: v_mov_b64_e32 v[18:19], 32 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x1 +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[32:33], v[32:33] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[66:67], v[24:25], v[24:25] ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], 0xc8, v[14:15] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[24:25], 0x64, v[24:25] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 
v[24:25], v[2:3], v[2:3] +; GCN-SDAG-NEXT: global_store_b128 v[14:15], v[30:33], off +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[32:33], 0x50 +; GCN-SDAG-NEXT: global_store_b128 v[22:23], v[0:3], off +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 64 +; GCN-SDAG-NEXT: v_dual_mov_b32 v36, v28 :: v_dual_mov_b32 v37, v29 +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[22:23], v[0:1], v[0:1] +; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v54 :: v_dual_mov_b32 v1, v55 +; GCN-SDAG-NEXT: global_store_b128 v[18:19], v[6:9], off +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[30:31], v[30:31] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[16:17], v[16:17] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], v[12:13], v[12:13] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], v[10:11], v[10:11] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[82:83], v[28:29], v[28:29] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[80:81], 0xc8, v[26:27] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[70:71], 0x64, v[8:9] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[68:69], v[6:7], v[6:7] ; GCN-SDAG-NEXT: s_clause 0x1 -; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off -; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[0:3], off +; GCN-SDAG-NEXT: global_store_b128 v[32:33], v[34:37], off +; GCN-SDAG-NEXT: global_store_b128 v[2:3], v[10:13], off ; GCN-SDAG-NEXT: s_clause 0x7 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:64 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:80 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:32 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16 -; GCN-SDAG-NEXT: s_wait_xcnt 0x8 -; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: 
v_dual_mov_b32 v1, v33 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:96 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:112 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:64 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[80:83], off offset:80 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[68:71], off offset:32 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[64:67], off offset:48 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[52:55], off +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:16 ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GCN-GISEL-LABEL: test_v16i64_load_store: diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 8e427a6ef2023..07f13f5712ed7 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1477,57 +1477,57 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: s_addc_u32 s5, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_mov_b32_e32 v13, s2 -; CI-NEXT: s_add_u32 s2, s0, 48 -; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v20, s1 +; CI-NEXT: v_mov_b32_e32 v19, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 ; CI-NEXT: v_cvt_f32_f16_e32 
v9, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_mov_b32_e32 v7, s3 +; CI-NEXT: v_mov_b32_e32 v6, s2 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; CI-NEXT: flat_store_dwordx4 v[6:7], v[9:12] +; CI-NEXT: s_add_u32 s2, s0, 48 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 ; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[14:15], 
v[10:13] -; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] +; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; CI-NEXT: v_mov_b32_e32 v10, s3 +; CI-NEXT: v_mov_b32_e32 v22, s1 +; CI-NEXT: v_mov_b32_e32 v9, s2 +; CI-NEXT: v_mov_b32_e32 v21, s0 +; CI-NEXT: flat_store_dwordx4 v[19:20], v[11:14] +; CI-NEXT: flat_store_dwordx4 v[9:10], v[15:18] +; CI-NEXT: flat_store_dwordx4 v[21:22], v[5:8] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f32: @@ -1547,40 +1547,40 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 -; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v21, s3 -; VI-NEXT: v_mov_b32_e32 v20, s2 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v21, s1 +; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: v_mov_b32_e32 v20, s0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; VI-NEXT: flat_store_dwordx4 
v[8:9], v[11:14] ; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; VI-NEXT: v_cvt_f32_f16_e32 v14, v7 ; VI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; VI-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: global_extload_v16f16_to_v16f32: @@ -2033,43 +2033,43 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v7, s3 -; CI-NEXT: v_mov_b32_e32 v6, s2 +; CI-NEXT: v_mov_b32_e32 v19, s1 +; CI-NEXT: v_mov_b32_e32 v18, s0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: 
v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 +; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_add_u32 s2, s0, 32 -; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 ; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 -; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; CI-NEXT: v_mov_b32_e32 v21, s3 +; CI-NEXT: v_mov_b32_e32 v23, s1 +; CI-NEXT: v_mov_b32_e32 v20, s2 +; CI-NEXT: v_mov_b32_e32 v22, s0 +; CI-NEXT: flat_store_dwordx4 v[0:1], v[14:17] 
+; CI-NEXT: flat_store_dwordx4 v[20:21], v[10:13] +; CI-NEXT: flat_store_dwordx4 v[22:23], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[18:19], v[2:5] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v8f16_to_v8f64: @@ -2084,39 +2084,39 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v8, s3 -; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: v_mov_b32_e32 v13, s1 +; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v12, s0 +; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: v_mov_b32_e32 v21, s3 +; VI-NEXT: v_mov_b32_e32 v23, s1 +; VI-NEXT: v_mov_b32_e32 v20, s2 +; VI-NEXT: v_mov_b32_e32 v22, s0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6] -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 -; VI-NEXT: v_mov_b32_e32 v17, s1 -; VI-NEXT: 
v_mov_b32_e32 v16, s0 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; VI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v10 +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v6 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: global_extload_v8f16_to_v8f64: @@ -2214,80 +2214,80 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_mov_b32_e32 v16, s2 -; CI-NEXT: s_add_u32 s2, s0, 16 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v18, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x70 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: v_mov_b32_e32 v18, s1 +; CI-NEXT: v_mov_b32_e32 v17, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v10, 
16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v10 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; CI-NEXT: v_mov_b32_e32 v2, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v8 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v7 +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v22 +; CI-NEXT: 
flat_store_dwordx4 v[1:2], v[12:15] +; CI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; CI-NEXT: v_cvt_f64_f32_e32 v[13:14], v0 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v1 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v20, s3 +; CI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; CI-NEXT: v_mov_b32_e32 v19, s2 +; CI-NEXT: v_cvt_f64_f32_e32 v[11:12], v3 +; CI-NEXT: flat_store_dwordx4 v[19:20], v[7:10] +; CI-NEXT: s_add_u32 s2, s0, 0x70 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; CI-NEXT: v_mov_b32_e32 v22, s3 +; CI-NEXT: v_mov_b32_e32 v21, s2 +; CI-NEXT: flat_store_dwordx4 v[17:18], v[11:14] ; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: v_cvt_f64_f32_e32 v[17:18], v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v15 +; CI-NEXT: v_cvt_f64_f32_e32 v[15:16], v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; CI-NEXT: v_mov_b32_e32 v20, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v19, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x50 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v24 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v0 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; CI-NEXT: s_addc_u32 s1, 
s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v18, s2 -; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; CI-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; CI-NEXT: v_mov_b32_e32 v24, s3 +; CI-NEXT: v_mov_b32_e32 v26, s1 +; CI-NEXT: v_mov_b32_e32 v23, s2 +; CI-NEXT: v_mov_b32_e32 v25, s0 +; CI-NEXT: flat_store_dwordx4 v[21:22], v[10:13] +; CI-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; CI-NEXT: flat_store_dwordx4 v[23:24], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[25:26], v[1:4] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f64: @@ -2299,85 +2299,84 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v14, s3 -; VI-NEXT: v_mov_b32_e32 v13, s2 +; VI-NEXT: v_mov_b32_e32 v15, s3 +; VI-NEXT: v_mov_b32_e32 v14, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v16, s3 -; VI-NEXT: v_mov_b32_e32 v15, s2 +; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: v_mov_b32_e32 v16, s0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; VI-NEXT: 
v_cvt_f32_f16_e32 v8, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_cvt_f32_f16_sdwa v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v3 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v18 ; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v20 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v21 +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v18, s3 -; VI-NEXT: v_mov_b32_e32 v17, s2 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x50 -; VI-NEXT: v_mov_b32_e32 v12, s1 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v11, s0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; VI-NEXT: v_mov_b32_e32 v14, s3 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: v_mov_b32_e32 v13, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v24, v7 +; 
VI-NEXT: flat_store_dwordx4 v[1:2], v[12:15] +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cvt_f32_f16_sdwa v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v22 +; VI-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; VI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v0 +; VI-NEXT: v_mov_b32_e32 v21, s3 +; VI-NEXT: v_mov_b32_e32 v20, s2 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: s_add_u32 s2, s0, 64 +; VI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; VI-NEXT: v_mov_b32_e32 v16, s3 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: v_mov_b32_e32 v15, s2 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v13 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x70 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] -; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: 
v_cvt_f32_f16_e32 v7, v3 -; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9 -; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 -; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v5 +; VI-NEXT: v_cvt_f64_f32_e32 v[16:17], v0 ; VI-NEXT: s_add_u32 s0, s0, 0x60 -; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v24 +; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 -; VI-NEXT: v_mov_b32_e32 v20, s3 -; VI-NEXT: v_mov_b32_e32 v14, s1 -; VI-NEXT: v_mov_b32_e32 v19, s2 -; VI-NEXT: v_mov_b32_e32 v13, s0 -; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8] +; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v23 +; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; VI-NEXT: v_mov_b32_e32 v23, s3 +; VI-NEXT: v_mov_b32_e32 v25, s1 +; VI-NEXT: v_mov_b32_e32 v22, s2 +; VI-NEXT: v_mov_b32_e32 v24, s0 +; VI-NEXT: flat_store_dwordx4 v[20:21], v[6:9] +; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17] +; VI-NEXT: flat_store_dwordx4 v[22:23], v[10:13] +; VI-NEXT: flat_store_dwordx4 v[24:25], v[1:4] ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: global_extload_v16f16_to_v16f64: @@ -2937,18 +2936,19 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v9, s3 +; CI-NEXT: v_mov_b32_e32 v8, s2 +; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 
; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_add_u32 s4, s2, 48 ; CI-NEXT: s_addc_u32 s5, s3, 0 -; CI-NEXT: v_mov_b32_e32 v9, s3 ; CI-NEXT: v_mov_b32_e32 v4, s4 -; CI-NEXT: v_mov_b32_e32 v8, s2 ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s5 @@ -2956,51 +2956,50 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: v_mov_b32_e32 v13, s3 ; CI-NEXT: v_mov_b32_e32 v12, s2 -; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; CI-NEXT: flat_load_dwordx4 v[10:13], v[12:13] +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v2, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; 
CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_or_b32_e32 v0, v0, v18 ; CI-NEXT: v_or_b32_e32 v3, v6, v2 ; CI-NEXT: v_or_b32_e32 v2, v17, v7 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_or_b32_e32 v1, v10, v6 +; CI-NEXT: v_or_b32_e32 v1, v16, v6 ; CI-NEXT: v_or_b32_e32 v0, v8, v7 -; CI-NEXT: v_or_b32_e32 v3, v14, v9 -; CI-NEXT: v_or_b32_e32 v2, v12, v11 ; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_or_b32_e32 v3, v12, v9 +; CI-NEXT: v_or_b32_e32 v2, v10, v11 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; @@ -3008,18 +3007,19 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 48 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s3, 
s3, 0 @@ -3027,43 +3027,42 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 -; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_cvt_f16_f32_sdwa v16, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; VI-NEXT: flat_load_dwordx4 v[10:13], v[12:13] +; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_cvt_f16_f32_sdwa v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_sdwa v15, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: 
v_cvt_f16_f32_e32 v14, v14 -; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 ; VI-NEXT: v_or_b32_e32 v3, v6, v7 -; VI-NEXT: v_or_b32_e32 v2, v18, v17 +; VI-NEXT: v_or_b32_e32 v2, v18, v15 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_or_b32_e32 v1, v10, v11 +; VI-NEXT: v_or_b32_e32 v1, v17, v16 ; VI-NEXT: v_or_b32_e32 v0, v8, v9 -; VI-NEXT: v_or_b32_e32 v3, v14, v15 -; VI-NEXT: v_or_b32_e32 v2, v12, v13 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; VI-NEXT: v_or_b32_e32 v3, v12, v13 +; VI-NEXT: v_or_b32_e32 v2, v10, v11 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index b9d3763e7def1..4d8a22e86535f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2642,73 +2642,73 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, 
v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16 -; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18 -; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11 +; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 +; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 
12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18 +; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 +; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 -; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18 -; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 +; GFX8-NEXT: v_mul_lo_u16_e32 v2, v3, v2 +; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v19, v17, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v14 +; GFX8-NEXT: v_or_b32_sdwa v6, v20, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v10, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v8, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v9, v9, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u16_e32 v3, v8, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v9 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 -; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 50f0a39802270..3dce59ae092e0 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll 
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2504,37 +2504,38 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 -; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v3 ; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4 ; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 +; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2 -; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 20, 4 +; GFX8-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v2 ; GFX8-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX8-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v19, v2, 16, 4 -; GFX8-NEXT: v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v18, v10, v17 -; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v3 -; GFX8-NEXT: v_bfe_u32 v3, v2, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX8-NEXT: v_mul_lo_u16_e32 v2, v12, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2 +; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 4 +; GFX8-NEXT: v_bfe_u32 v3, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v2, v2, 20, 4 +; GFX8-NEXT: v_mul_lo_u16_e32 v20, v10, v17 +; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v19, v11, v18 ; 
GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v9, v18, v9 -; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v2, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v20, v3 +; GFX8-NEXT: v_or_b32_e32 v9, v19, v2 ; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v3 ; GFX8-NEXT: v_mul_lo_u16_e32 v6, v6, v13 +; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX8-NEXT: v_or_b32_sdwa v3, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 @@ -2545,11 +2546,10 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX8-NEXT: v_mad_u16 v2, v12, v19, v2 +; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9 ; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v9 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v12 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f5d7bb3a45fe1..3696946ea8653 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1884,80 +1884,81 @@ entry: define amdgpu_kernel void 
@insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; GENERIC-LABEL: insert_w_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_load_dword s0, s[4:5], 0xb ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 ; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 -; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 -; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000 -; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x40c00000 ; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 -; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000 -; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000 -; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000 -; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000 -; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000 -; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000 -; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000 -; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41700000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_add_i32 s4, s4, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_add_i32 s14, s0, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 3 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 2 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v0, v10, s[0:1] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 1 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 0 +; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 7 +; GENERIC-NEXT: s_cselect_b64 
s[6:7], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v1, v10, s[6:7] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 6 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 5 +; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 4 +; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 11 +; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v5, v10, s[12:13] +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41600000 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v3, v10, s[6:7] +; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41500000 +; GENERIC-NEXT: v_cndmask_b32_e64 v5, v4, v10, s[8:9] +; GENERIC-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, -1 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; 
GENERIC-NEXT: v_cndmask_b32_e64 v1, 2.0, v10, s[0:1] +; GENERIC-NEXT: v_cndmask_b32_e64 v0, 1.0, v10, s[2:3] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 10 +; GENERIC-NEXT: v_cndmask_b32_e64 v4, v9, v10, s[10:11] +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 9 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v12, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 8 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 15 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0 offset:32 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v15, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 13 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v16, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 12 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, 
s[0:3], 0 offset:48 -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v17, v10, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: insert_w_offset: @@ -2237,81 +2238,82 @@ entry: define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) { ; GENERIC-LABEL: insert_unsigned_base_plus_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_load_dword s0, s[4:5], 0xb ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 ; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 -; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 -; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000 -; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x40c00000 ; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 -; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000 -; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000 -; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000 -; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000 -; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000 -; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000 -; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000 -; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41700000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_and_b32 s4, s4, 0xffff -; GENERIC-NEXT: s_add_i32 s4, s4, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_and_b32 s0, 
s0, 0xffff +; GENERIC-NEXT: s_add_i32 s14, s0, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 3 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 2 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v0, v10, s[0:1] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 1 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 0 +; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 7 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v1, v10, s[6:7] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 6 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 5 +; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 4 +; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 11 +; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v5, v10, s[12:13] +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41600000 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v3, v10, s[6:7] +; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41500000 +; GENERIC-NEXT: v_cndmask_b32_e64 v5, v4, v10, s[8:9] +; GENERIC-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, -1 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc -; 
GENERIC-NEXT: s_cmp_eq_u32 s4, 5 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, 2.0, v10, s[0:1] +; GENERIC-NEXT: v_cndmask_b32_e64 v0, 1.0, v10, s[2:3] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 10 +; GENERIC-NEXT: v_cndmask_b32_e64 v4, v9, v10, s[10:11] +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 9 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v12, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 8 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 15 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0 offset:32 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, 
v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v15, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 13 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v16, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 12 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v17, v10, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: insert_unsigned_base_plus_offset: @@ -2592,81 +2594,82 @@ entry: define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) { ; GENERIC-LABEL: insert_signed_base_plus_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_load_dword s0, s[4:5], 0xb ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 ; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 -; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 -; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000 -; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x40c00000 ; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 -; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000 -; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000 -; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000 -; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000 -; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000 -; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000 -; GENERIC-NEXT: v_mov_b32_e32 v17, 
0x41600000 -; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41700000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_sext_i32_i16 s4, s4 -; GENERIC-NEXT: s_add_i32 s4, s4, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_sext_i32_i16 s0, s0 +; GENERIC-NEXT: s_add_i32 s14, s0, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 3 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 2 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v0, v10, s[0:1] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 1 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 0 +; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 7 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v1, v10, s[6:7] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 6 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 5 +; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 4 +; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 11 +; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v5, v10, s[12:13] +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41600000 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v3, v10, s[6:7] +; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41500000 +; GENERIC-NEXT: v_cndmask_b32_e64 v5, v4, v10, s[8:9] +; GENERIC-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, -1 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc -; GENERIC-NEXT: 
s_cmp_eq_u32 s4, 1 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, 2.0, v10, s[0:1] +; GENERIC-NEXT: v_cndmask_b32_e64 v0, 1.0, v10, s[2:3] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 10 +; GENERIC-NEXT: v_cndmask_b32_e64 v4, v9, v10, s[10:11] +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 9 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v12, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 8 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v10, vcc +; GENERIC-NEXT: 
s_cmp_eq_u32 s14, 15 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0 offset:32 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v15, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 13 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v16, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 12 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v17, v10, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: insert_signed_base_plus_offset: @@ -2954,79 +2957,80 @@ entry: define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; GENERIC-LABEL: insert_wo_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_load_dword s14, s[4:5], 0xb ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 ; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 -; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 -; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000 -; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 
v1, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x40c00000 ; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 -; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000 -; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000 -; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000 -; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000 -; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000 -; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000 -; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000 -; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41700000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 3 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 2 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v0, v10, s[0:1] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 1 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 0 +; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 7 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v1, v10, s[6:7] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 6 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 5 +; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 4 +; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s14, 11 +; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v5, v10, s[12:13] +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41600000 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v3, v10, s[6:7] +; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41500000 +; GENERIC-NEXT: v_cndmask_b32_e64 v5, v4, v10, s[8:9] +; 
GENERIC-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, -1 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, 2.0, v10, s[0:1] +; GENERIC-NEXT: v_cndmask_b32_e64 v0, 1.0, v10, s[2:3] +; GENERIC-NEXT: s_cmp_eq_u32 s14, 10 +; GENERIC-NEXT: v_cndmask_b32_e64 v4, v9, v10, s[10:11] +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 9 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc -; 
GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v12, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 8 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 15 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0 offset:32 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v15, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 13 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v16, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s14, 12 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v17, v10, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: insert_wo_offset: @@ -3593,8 +3597,8 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; ; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 @@ -3612,8 +3616,6 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 15 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12 @@ -3629,6 +3631,8 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off @@ -5646,116 +5650,116 @@ bb2: define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) { ; GENERIC-LABEL: insert_vgpr_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0xd +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_mov_b32 s26, 0 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, 0 ; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GENERIC-NEXT: v_mov_b32_e32 v2, 0 -; GENERIC-NEXT: s_mov_b32 s27, s3 +; GENERIC-NEXT: s_mov_b32 s3, s7 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_load_dword v14, v[1:2], s[24:27], 0 addr64 glc +; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[0:3], 0 addr64 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: ;;#ASMSTART ; 
GENERIC-NEXT: v_mov_b32 v1, 62 ; GENERIC-NEXT: ;;#ASMEND -; GENERIC-NEXT: v_mov_b32_e32 v10, s22 -; GENERIC-NEXT: v_mov_b32_e32 v11, s23 -; GENERIC-NEXT: v_mov_b32_e32 v15, s16 -; GENERIC-NEXT: v_mov_b32_e32 v2, s18 -; GENERIC-NEXT: v_mov_b32_e32 v3, s19 -; GENERIC-NEXT: v_mov_b32_e32 v4, s12 -; GENERIC-NEXT: v_mov_b32_e32 v5, s13 -; GENERIC-NEXT: v_mov_b32_e32 v6, s14 -; GENERIC-NEXT: v_mov_b32_e32 v7, s15 -; GENERIC-NEXT: v_mov_b32_e32 v8, s8 -; GENERIC-NEXT: v_mov_b32_e32 v9, s9 -; GENERIC-NEXT: v_mov_b32_e32 v12, s10 -; GENERIC-NEXT: v_mov_b32_e32 v13, s11 -; GENERIC-NEXT: v_add_i32_e32 v18, vcc, 1, v14 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; GENERIC-NEXT: v_mov_b32_e32 v3, s23 +; GENERIC-NEXT: v_mov_b32_e32 v4, s19 +; GENERIC-NEXT: v_mov_b32_e32 v5, s15 +; GENERIC-NEXT: v_mov_b32_e32 v7, s10 +; GENERIC-NEXT: v_mov_b32_e32 v6, s11 +; GENERIC-NEXT: v_add_i32_e32 v15, vcc, 1, v2 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 ; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc -; GENERIC-NEXT: 
v_cmp_eq_u32_e32 vcc, 2, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v15 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v15 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v5, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v15 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v15 +; GENERIC-NEXT: v_cndmask_b32_e32 v18, 63, v3, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v15 +; GENERIC-NEXT: v_mov_b32_e32 v4, s9 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v3, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v15 +; GENERIC-NEXT: v_mov_b32_e32 v7, s8 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v3, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GENERIC-NEXT: v_mov_b32_e32 v7, s14 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v15 +; GENERIC-NEXT: v_mov_b32_e32 v8, s13 ; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc -; GENERIC-NEXT: 
v_cmp_ne_u32_e32 vcc, 5, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc -; GENERIC-NEXT: v_mov_b32_e32 v16, s17 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v15 +; GENERIC-NEXT: v_mov_b32_e32 v11, s12 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v7, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v15 +; GENERIC-NEXT: v_mov_b32_e32 v11, s18 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v15 +; GENERIC-NEXT: v_mov_b32_e32 v12, s17 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v11, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v12, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v15 +; GENERIC-NEXT: v_mov_b32_e32 v16, s16 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v11, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 ; GENERIC-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v15 +; GENERIC-NEXT: v_mov_b32_e32 v16, s20 ; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc -; 
GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 -; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 -; GENERIC-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] -; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc -; GENERIC-NEXT: v_mov_b32_e32 v19, s20 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 -; GENERIC-NEXT: v_mov_b32_e32 v15, s21 -; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v14 -; GENERIC-NEXT: v_cndmask_b32_e64 v14, v15, v1, s[0:1] -; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 -; GENERIC-NEXT: v_cndmask_b32_e64 v15, 63, v14, s[0:1] -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_mov_b32 s2, -1 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; GENERIC-NEXT: v_mov_b32_e32 v19, s21 +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v2 +; GENERIC-NEXT: v_mov_b32_e32 v17, s22 +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[2:3], 14, v2 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v17, v1, s[2:3] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[2:3], 14, v15 +; GENERIC-NEXT: v_cndmask_b32_e64 v17, 63, v2, s[2:3] +; GENERIC-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s6, -1 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v16, v1, vcc +; GENERIC-NEXT: v_cndmask_b32_e64 v16, v19, v1, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v15 +; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v16, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v15 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v2, vcc ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:48 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: buffer_store_dwordx4 v[11:14], 
off, s[4:7], 0 offset:32 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 +; GENERIC-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GENERIC-NEXT: s_cbranch_execz .LBB17_2 ; GENERIC-NEXT: ; %bb.1: ; %bb1 -; GENERIC-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GENERIC-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: .LBB17_2: ; %bb2 ; GENERIC-NEXT: s_endpgm @@ -6187,117 +6191,116 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; ; SI-MOVREL-LABEL: insert_vgpr_offset_multiple_in_block: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 -; SI-MOVREL-NEXT: s_mov_b32 s10, 0 -; SI-MOVREL-NEXT: s_mov_b32 s11, s3 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, 0 +; SI-MOVREL-NEXT: s_mov_b32 s3, s7 ; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dword v14, v[1:2], s[8:11], 0 addr64 glc +; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[0:3], 0 addr64 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 ; SI-MOVREL-NEXT: ;;#ASMSTART ; SI-MOVREL-NEXT: v_mov_b32 v1, 62 ; SI-MOVREL-NEXT: ;;#ASMEND -; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s19 -; 
SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s8 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s11 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s22 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s23 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s16 -; SI-MOVREL-NEXT: v_add_i32_e32 v18, vcc, 1, v14 -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s23 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s22 +; SI-MOVREL-NEXT: v_mov_b32_e32 v19, s21 +; SI-MOVREL-NEXT: v_add_i32_e32 v15, vcc, 1, v2 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; SI-MOVREL-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v5, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v4, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v18, 63, v3, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s9 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v3, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s8 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v3, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s14 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; SI-MOVREL-NEXT: 
v_cndmask_b32_e32 v7, v7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v15 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s17 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v7, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v11, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v12, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v11, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v16, 
v1, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v15 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[2:3], 14, v2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s20 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 -; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s21 -; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e64 v14, v15, v1, s[0:1] -; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e64 v15, 63, v14, s[0:1] -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v19, s20 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, v17, v1, s[2:3] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[2:3], 14, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v17, 63, v2, s[2:3] +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v16, v1, vcc +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v16, v19, v1, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v16, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v15 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v2, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:48 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; 
SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:32 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-MOVREL-NEXT: s_cbranch_execz .LBB17_2 ; SI-MOVREL-NEXT: ; %bb.1: ; %bb1 -; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: .LBB17_2: ; %bb2 ; SI-MOVREL-NEXT: s_endpgm @@ -6311,121 +6314,121 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: flat_load_dword v14, v[1:2] glc +; VI-NEXT: flat_load_dword v5, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s19 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: v_mov_b32 v1, 62 +; VI-NEXT: v_mov_b32 v6, 62 ; VI-NEXT: ;;#ASMEND -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mov_b32_e32 v9, s9 -; VI-NEXT: v_mov_b32_e32 v12, s10 -; VI-NEXT: v_mov_b32_e32 v13, s11 -; VI-NEXT: v_mov_b32_e32 v10, s22 -; VI-NEXT: v_mov_b32_e32 v11, s23 -; VI-NEXT: v_mov_b32_e32 v15, s16 -; VI-NEXT: v_add_u32_e32 v18, vcc, 1, v14 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 -; VI-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, 
v14 -; VI-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 -; VI-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 -; VI-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; VI-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 -; VI-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; VI-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 -; VI-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 -; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 -; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 -; VI-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 -; VI-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 -; VI-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 -; VI-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 -; VI-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 -; VI-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 -; VI-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 -; VI-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 -; VI-NEXT: v_mov_b32_e32 v16, s17 -; VI-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 -; VI-NEXT: 
v_cndmask_b32_e32 v11, v16, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 -; VI-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 -; VI-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 -; VI-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] -; VI-NEXT: v_mov_b32_e32 v15, s21 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v14 -; VI-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 -; VI-NEXT: v_cndmask_b32_e64 v14, v15, v1, s[0:1] -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 -; VI-NEXT: v_cndmask_b32_e64 v15, 63, v14, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s11 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v12, s17 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v19, s20 -; VI-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 -; VI-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc +; VI-NEXT: v_mov_b32_e32 v16, s16 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v21, s20 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 -; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v5 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v5 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v15 +; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v15 +; VI-NEXT: v_cndmask_b32_e32 v10, 63, v3, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v15 +; VI-NEXT: v_cndmask_b32_e32 
v14, 63, v2, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v15 +; VI-NEXT: v_cndmask_b32_e32 v18, 63, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v15 +; VI-NEXT: v_mov_b32_e32 v2, s9 +; VI-NEXT: v_cndmask_b32_e32 v3, 63, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v15 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_cndmask_b32_e32 v2, 63, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_cndmask_b32_e32 v1, 63, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v5 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v15 +; VI-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v5 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v15 +; VI-NEXT: v_cndmask_b32_e32 v8, 63, v7, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v5 +; VI-NEXT: v_cndmask_b32_e32 v7, v11, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v15 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v5 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v15 +; VI-NEXT: v_cndmask_b32_e32 v13, 63, v11, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v5 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v15 +; VI-NEXT: v_cndmask_b32_e32 v12, 63, v11, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v5 +; VI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v15 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v5 +; VI-NEXT: v_cndmask_b32_e32 v16, v16, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v15 +; VI-NEXT: 
v_cndmask_b32_e32 v17, 63, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v5 +; VI-NEXT: v_cndmask_b32_e32 v16, v19, v6, vcc +; VI-NEXT: v_mov_b32_e32 v20, s3 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v15 +; VI-NEXT: v_mov_b32_e32 v19, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_cndmask_b32_e32 v16, 63, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v5 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v15, s3 -; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: v_cndmask_b32_e32 v5, v21, v6, vcc +; VI-NEXT: v_mov_b32_e32 v22, s3 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v15 +; VI-NEXT: v_mov_b32_e32 v21, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_cndmask_b32_e32 v15, 63, v5, vcc ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, s3 -; VI-NEXT: v_mov_b32_e32 v10, s2 -; VI-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; VI-NEXT: flat_store_dwordx4 v[21:22], v[11:14] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s1 -; VI-NEXT: v_mov_b32_e32 v6, s0 -; VI-NEXT: flat_store_dwordx4 v[6:7], v[2:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v12, s3 +; VI-NEXT: v_mov_b32_e32 v11, s2 +; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, s1 +; VI-NEXT: v_mov_b32_e32 v7, s0 +; VI-NEXT: flat_store_dwordx4 v[7:8], v[1:4] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; VI-NEXT: s_cbranch_execz .LBB17_2 ; VI-NEXT: ; %bb.1: ; %bb1 -; VI-NEXT: flat_store_dword v[0:1], v1 +; VI-NEXT: flat_store_dword v[0:1], v6 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: .LBB17_2: ; %bb2 ; VI-NEXT: s_endpgm @@ -6435,104 +6438,104 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; 
GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, 0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dword v14, v1, s[0:1] glc +; GFX9-IDXMODE-NEXT: global_load_dword v2, v1, s[0:1] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: ;;#ASMSTART ; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62 ; GFX9-IDXMODE-NEXT: ;;#ASMEND -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s11 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s22 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s23 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s16 -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s23 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s16 +; GFX9-IDXMODE-NEXT: 
v_mov_b32_e32 v19, s20 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v15, 1, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 -; GFX9-IDXMODE-NEXT: v_add_u32_e32 v18, 1, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v5, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v4, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v18, 63, v3, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-IDXMODE-NEXT: 
v_cndmask_b32_e32 v5, 63, v3, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v3, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s14 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v15 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s17 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc +; 
GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v7, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v11, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v12, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v11, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s22 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s21 -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v14, v15, v1, s[0:1] -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v15, 63, v14, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, 63, v16, vcc +; 
GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v2 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, v16, v1, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v16, 63, v2, s[0:1] ; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, 0 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, v19, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v15 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v2, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v20, v[15:18], s[0:1] offset:48 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v20, v[11:14], s[0:1] offset:32 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v20, v[7:10], s[0:1] offset:16 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v20, v[3:6], s[0:1] ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2 @@ -6567,134 +6570,136 @@ bb2: define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) { ; GENERIC-LABEL: insert_w_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x9 -; 
GENERIC-NEXT: s_load_dword s24, s[4:5], 0xb -; GENERIC-NEXT: s_mov_b32 s31, 0xf000 -; GENERIC-NEXT: s_mov_b32 s30, -1 -; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41500000 -; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41880000 -; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41600000 -; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41700000 -; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41800000 -; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000 -; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000 -; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000 -; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000 -; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 -; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40c00000 -; GENERIC-NEXT: v_mov_b32_e32 v11, 0x40e00000 -; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41000000 -; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40400000 +; GENERIC-NEXT: s_load_dword s33, s[4:5], 0xb +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x40a00000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40400000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_add_i32 s25, s24, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 12 +; GENERIC-NEXT: s_add_i32 s30, s33, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 12 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 13 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 13 ; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 14 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 14 ; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 15 -; GENERIC-NEXT: 
s_cselect_b64 s[4:5], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 8 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 15 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 8 ; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 9 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 9 ; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 10 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 10 ; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 11 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 11 ; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 4 -; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 5 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 4 ; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 6 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 5 ; GENERIC-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 7 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 6 ; GENERIC-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 0 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, v0, s[22:23] -; GENERIC-NEXT: s_cmp_eq_u32 s25, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 7 ; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v14, 2.0, v0, s[22:23] -; GENERIC-NEXT: s_cmp_eq_u32 s25, 2 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[22:23] -; GENERIC-NEXT: s_cmp_eq_u32 s25, 3 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v16, 4.0, v0, s[22:23] -; GENERIC-NEXT: s_add_i32 s26, s24, 2 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 3 -; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 2 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 0 ; GENERIC-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 1 +; 
GENERIC-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 2 +; GENERIC-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s30, 3 +; GENERIC-NEXT: s_cselect_b64 s[30:31], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v19, 4.0, v4, s[30:31] +; GENERIC-NEXT: s_add_i32 s33, s33, 2 +; GENERIC-NEXT: s_cmp_lg_u32 s33, 3 +; GENERIC-NEXT: s_cselect_b64 s[30:31], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, v4, v19, s[30:31] +; GENERIC-NEXT: v_cndmask_b32_e64 v18, v1, v4, s[28:29] +; GENERIC-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s31, 0xf000 +; GENERIC-NEXT: s_mov_b32 s30, -1 +; GENERIC-NEXT: v_cndmask_b32_e64 v16, 1.0, v4, s[24:25] +; GENERIC-NEXT: v_cndmask_b32_e64 v17, 2.0, v4, s[26:27] +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[16:19], off, s[28:31], 0 +; GENERIC-NEXT: s_cmp_lg_u32 s33, 2 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v4, v18, s[4:5] +; GENERIC-NEXT: s_cmp_lg_u32 s33, 1 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, v4, v17, s[4:5] +; GENERIC-NEXT: s_cmp_lg_u32 s33, 0 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s33, 7 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[22:23] -; GENERIC-NEXT: v_cndmask_b32_e64 v15, v0, v15, s[24:25] -; GENERIC-NEXT: s_cmp_lg_u32 s26, 1 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v14, v0, v14, s[22:23] -; GENERIC-NEXT: s_cmp_lg_u32 s26, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v19, v0, v4, s[22:23] ; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v13, v0, v13, s[22:23] -; GENERIC-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[14:15] -; GENERIC-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[16:17] -; GENERIC-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[18:19] -; GENERIC-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[20:21] -; GENERIC-NEXT: 
buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:16 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 7 -; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 6 -; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v20, v4, v19, s[22:23] +; GENERIC-NEXT: v_cndmask_b32_e64 v0, v4, v16, s[4:5] +; GENERIC-NEXT: v_cndmask_b32_e64 v16, v13, v4, s[16:17] +; GENERIC-NEXT: v_cndmask_b32_e64 v17, v14, v4, s[18:19] +; GENERIC-NEXT: v_cndmask_b32_e64 v18, v15, v4, s[20:21] +; GENERIC-NEXT: buffer_store_dwordx4 v[16:19], off, s[28:31], 0 offset:16 +; GENERIC-NEXT: s_cmp_lg_u32 s33, 6 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e64 v12, v0, v12, s[14:15] -; GENERIC-NEXT: v_cndmask_b32_e64 v11, v0, v11, s[16:17] -; GENERIC-NEXT: s_cmp_lg_u32 s26, 5 -; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[14:15] -; GENERIC-NEXT: s_cmp_lg_u32 s26, 4 -; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v9, v0, v9, s[14:15] -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GENERIC-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] -; GENERIC-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[2:3] -; GENERIC-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] -; GENERIC-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[6:7] -; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:48 -; GENERIC-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] -; GENERIC-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[10:11] -; GENERIC-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[12:13] -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:32 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 11 -; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:80 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_waitcnt expcnt(1) -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 10 -; GENERIC-NEXT: s_cselect_b64 vcc, 
-1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 9 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 8 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 15 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:96 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 13 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 12 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:112 -; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 offset:64 +; GENERIC-NEXT: v_cndmask_b32_e64 v19, v4, v18, s[4:5] +; GENERIC-NEXT: s_cmp_lg_u32 s33, 5 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v18, v4, v17, s[4:5] +; GENERIC-NEXT: s_cmp_lg_u32 s33, 4 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s33, 11 +; GENERIC-NEXT: v_cndmask_b32_e64 v12, v12, v4, s[12:13] +; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, v4, v12, s[12:13] +; GENERIC-NEXT: v_cndmask_b32_e64 v17, v4, v16, s[4:5] +; GENERIC-NEXT: v_cndmask_b32_e64 v9, v9, v4, s[6:7] +; GENERIC-NEXT: v_cndmask_b32_e64 v10, v10, v4, s[8:9] +; GENERIC-NEXT: v_cndmask_b32_e64 v11, v11, v4, s[10:11] +; GENERIC-NEXT: s_cmp_lg_u32 s33, 10 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s33, 9 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s33, 8 +; GENERIC-NEXT: s_cselect_b64 
s[8:9], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s33, 15 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[14:15] +; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v14, v4, v8, s[10:11] +; GENERIC-NEXT: buffer_store_dwordx4 v[17:20], off, s[28:31], 0 offset:80 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[2:3] +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:32 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e64 v12, v4, v11, s[4:5] +; GENERIC-NEXT: v_cndmask_b32_e64 v11, v4, v10, s[6:7] +; GENERIC-NEXT: s_cmp_lg_u32 s33, 14 +; GENERIC-NEXT: v_cndmask_b32_e64 v10, v4, v9, s[8:9] +; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], off, s[28:31], 0 offset:96 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v13, v4, v7, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s33, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, v4, v6, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s33, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[11:14], off, s[28:31], 0 offset:112 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[28:31], 0 offset:64 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: insert_w_offset_multiple_in_block: @@ -6889,7 +6894,6 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 ; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 -; SI-MOVREL-NEXT: s_add_i32 m0, s2, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 ; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 @@ -6905,16 +6909,17 @@ define amdgpu_kernel void 
@insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 +; SI-MOVREL-NEXT: s_add_i32 m0, s2, 2 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 ; SI-MOVREL-NEXT: s_endpgm ; @@ -6940,11 +6945,15 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 -; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 -; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v34, 0x41880000 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v34 ; VI-MOVREL-NEXT: s_add_i32 m0, s2, 2 -; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v32, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 ; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 ; VI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 @@ -6961,45 +6970,39 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; 
VI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 -; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 -; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 -; VI-MOVREL-NEXT: v_mov_b32_e32 v32, s2 -; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v34 +; VI-MOVREL-NEXT: v_mov_b32_e32 v35, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v34, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[32:33], v[12:15] -; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[34:35], v[8:11] ; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 -; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 -; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 -; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 64 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 ; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s4 ; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x60 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; VI-MOVREL-NEXT: s_addc_u32 s1, s1, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[20:23] 
-; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s2 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s3 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x50 +; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s4 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[6:7], v[20:23] +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s3 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[28:31] +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: flat_store_dwordx4 v[6:7], v[16:19] ; VI-MOVREL-NEXT: s_endpgm ; ; VI-IDXMODE-LABEL: insert_w_offset_multiple_in_block: @@ -7009,9 +7012,10 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) ; VI-IDXMODE-NEXT: s_add_i32 s3, s2, 1 +; VI-IDXMODE-NEXT: s_add_i32 s4, s2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 @@ -7024,12 +7028,16 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v34, 0x41880000 ; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 
v0, v34 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 -; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 @@ -7045,49 +7053,42 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v2 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v0 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v32 -; VI-IDXMODE-NEXT: s_set_gpr_idx_off -; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2 -; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v34 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v35, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v34, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[32:33], v[12:15] -; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[34:35], v[8:11] ; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 -; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 -; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 64 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 ; VI-IDXMODE-NEXT: 
s_add_u32 s4, s0, 0x70 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s4 ; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x60 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; VI-IDXMODE-NEXT: s_addc_u32 s1, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 +; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x50 +; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s4 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[6:7], v[20:23] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s3 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[28:31] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[6:7], v[16:19] ; VI-IDXMODE-NEXT: s_endpgm ; ; GFX9-IDXMODE-LABEL: insert_w_offset_multiple_in_block: @@ -7117,7 +7118,6 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 -; 
GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 @@ -7133,18 +7133,19 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v33, 0 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v32 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v33, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v33, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v33, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v33, v[28:31], s[0:1] offset:112 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v33, v[24:27], s[0:1] offset:96 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v33, v[20:23], s[0:1] offset:80 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v33, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v33, v[16:19], s[0:1] offset:64 ; GFX9-IDXMODE-NEXT: s_endpgm entry: %add1 = add i32 %in, 1 @@ -8620,85 +8621,86 @@ entry: define amdgpu_kernel void @insertelement_v16f32_or_index(ptr 
addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind { ; GENERIC-LABEL: insertelement_v16f32_or_index: ; GENERIC: ; %bb.0: -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; GENERIC-NEXT: s_load_dword s4, s[4:5], 0x29 -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_mov_b32 s2, -1 -; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40a00000 +; GENERIC-NEXT: s_load_dword s0, s[4:5], 0x29 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_lshl_b32 s4, s4, 2 -; GENERIC-NEXT: v_mov_b32_e32 v0, s11 -; GENERIC-NEXT: v_mov_b32_e32 v1, s10 -; GENERIC-NEXT: v_mov_b32_e32 v4, s9 -; GENERIC-NEXT: v_mov_b32_e32 v5, s8 -; GENERIC-NEXT: v_mov_b32_e32 v6, s15 -; GENERIC-NEXT: v_mov_b32_e32 v8, s14 -; GENERIC-NEXT: v_mov_b32_e32 v9, s13 +; GENERIC-NEXT: s_lshl_b32 s0, s0, 2 +; GENERIC-NEXT: v_mov_b32_e32 v1, s11 +; GENERIC-NEXT: v_mov_b32_e32 v2, s10 +; GENERIC-NEXT: v_mov_b32_e32 v5, s9 +; GENERIC-NEXT: v_mov_b32_e32 v6, s8 +; GENERIC-NEXT: v_mov_b32_e32 v3, s15 +; GENERIC-NEXT: v_mov_b32_e32 v7, s14 +; GENERIC-NEXT: v_mov_b32_e32 v10, s13 ; GENERIC-NEXT: v_mov_b32_e32 v11, s12 -; GENERIC-NEXT: v_mov_b32_e32 v12, s19 -; GENERIC-NEXT: v_mov_b32_e32 v13, s18 -; GENERIC-NEXT: v_mov_b32_e32 v14, s17 -; GENERIC-NEXT: v_mov_b32_e32 v15, s16 -; GENERIC-NEXT: v_mov_b32_e32 v16, s23 -; GENERIC-NEXT: v_mov_b32_e32 v17, s22 -; GENERIC-NEXT: v_mov_b32_e32 v18, s21 -; GENERIC-NEXT: v_mov_b32_e32 v19, s20 -; GENERIC-NEXT: s_or_b32 s4, s4, 1 -; GENERIC-NEXT: s_cmp_lg_u32 s4, 3 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 2 +; GENERIC-NEXT: v_mov_b32_e32 v9, s19 +; GENERIC-NEXT: v_mov_b32_e32 v12, s18 +; GENERIC-NEXT: v_mov_b32_e32 v13, s17 +; GENERIC-NEXT: s_or_b32 s14, s0, 1 +; GENERIC-NEXT: s_cmp_lg_u32 s14, 3 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v10, v1, vcc -; 
GENERIC-NEXT: s_cmp_lg_u32 s4, 1 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s14, 2 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v5, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v6, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v8, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 5 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 10 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cmp_lg_u32 s14, 1 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s14, 0 +; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s14, 7 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v0, v3, s[6:7] +; GENERIC-NEXT: s_cmp_lg_u32 s14, 6 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s14, 5 +; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s14, 4 +; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s14, 11 +; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v9, v0, v9, s[12:13] +; GENERIC-NEXT: v_mov_b32_e32 v14, s16 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; GENERIC-NEXT: v_mov_b32_e32 v15, s23 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v0, v5, s[0:1] +; GENERIC-NEXT: 
v_mov_b32_e32 v16, s22 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, v0, v6, s[2:3] +; GENERIC-NEXT: v_mov_b32_e32 v17, s21 +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v0, v7, s[6:7] +; GENERIC-NEXT: v_mov_b32_e32 v18, s20 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v0, v10, s[8:9] +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_cmp_lg_u32 s14, 10 +; GENERIC-NEXT: v_cndmask_b32_e64 v5, v0, v11, s[10:11] +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:16 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v13, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 9 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s14, 9 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 8 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v0, v13, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s14, 8 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v10, v15, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 15 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v0, v14, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s14, 15 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v10, v16, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v0, v15, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s14, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v10, v17, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 13 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v0, v16, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s14, 13 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v18, 
vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 12 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v0, v17, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s14, 12 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v0, v18, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: insertelement_v16f32_or_index: @@ -9810,15 +9812,15 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; VI-MOVREL-NEXT: s_cbranch_execnz .LBB27_1 ; VI-MOVREL-NEXT: ; %bb.2: ; VI-MOVREL-NEXT: s_mov_b64 exec, s[0:1] -; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; VI-MOVREL-NEXT: v_add_u32_e64 v3, s[0:1], 48, v0 ; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-MOVREL-NEXT: v_addc_u32_e64 v4, s[0:1], 0, v1, s[0:1] +; VI-MOVREL-NEXT: v_add_u32_e64 v21, s[0:1], 16, v0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[3:4], v[17:20] ; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-MOVREL-NEXT: v_addc_u32_e64 v22, vcc, 0, v1, s[0:1] ; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[13:16] -; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; VI-MOVREL-NEXT: flat_store_dwordx4 v[21:22], v[9:12] ; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[5:8] ; VI-MOVREL-NEXT: s_endpgm ; @@ -9855,15 +9857,15 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB27_1 ; VI-IDXMODE-NEXT: ; %bb.2: ; VI-IDXMODE-NEXT: s_mov_b64 exec, s[0:1] -; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; 
VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; VI-IDXMODE-NEXT: v_add_u32_e64 v3, s[0:1], 48, v0 ; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-IDXMODE-NEXT: v_addc_u32_e64 v4, s[0:1], 0, v1, s[0:1] +; VI-IDXMODE-NEXT: v_add_u32_e64 v21, s[0:1], 16, v0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[3:4], v[17:20] ; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-IDXMODE-NEXT: v_addc_u32_e64 v22, vcc, 0, v1, s[0:1] ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[13:16] -; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[21:22], v[9:12] ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[5:8] ; VI-IDXMODE-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e1b4cad370f96..d3ce1165df1f2 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -195,90 +195,84 @@ entry: define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) { ; GCN-LABEL: float32_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[4:5], 0x124 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v33, s3 -; GCN-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NEXT: v_mov_b32_e32 v4, s40 -; GCN-NEXT: v_mov_b32_e32 v5, s41 
-; GCN-NEXT: v_mov_b32_e32 v6, s42 -; GCN-NEXT: v_mov_b32_e32 v7, s43 -; GCN-NEXT: v_mov_b32_e32 v8, s44 -; GCN-NEXT: v_mov_b32_e32 v9, s45 -; GCN-NEXT: v_mov_b32_e32 v10, s46 -; GCN-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NEXT: v_mov_b32_e32 v12, s48 -; GCN-NEXT: v_mov_b32_e32 v13, s49 -; GCN-NEXT: v_mov_b32_e32 v14, s50 -; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: v_mov_b32_e32 v20, s12 -; GCN-NEXT: v_mov_b32_e32 v21, s13 -; GCN-NEXT: v_mov_b32_e32 v22, s14 -; GCN-NEXT: v_mov_b32_e32 v23, s15 -; GCN-NEXT: v_mov_b32_e32 v24, s16 -; GCN-NEXT: v_mov_b32_e32 v25, s17 -; GCN-NEXT: v_mov_b32_e32 v26, s18 -; GCN-NEXT: v_mov_b32_e32 v27, s19 -; GCN-NEXT: v_mov_b32_e32 v28, s20 -; GCN-NEXT: v_mov_b32_e32 v29, s21 -; GCN-NEXT: v_mov_b32_e32 v30, s22 -; GCN-NEXT: v_mov_b32_e32 v31, s23 ; GCN-NEXT: v_mov_b32_e32 v32, s2 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x124 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v35, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NEXT: v_mov_b32_e32 v10, s18 +; GCN-NEXT: v_mov_b32_e32 v11, s19 +; GCN-NEXT: v_mov_b32_e32 v12, s20 +; GCN-NEXT: v_mov_b32_e32 v13, s21 +; GCN-NEXT: v_mov_b32_e32 v14, s22 +; GCN-NEXT: v_mov_b32_e32 v15, s23 +; GCN-NEXT: v_mov_b32_e32 v16, s36 +; GCN-NEXT: v_mov_b32_e32 v17, s37 +; GCN-NEXT: v_mov_b32_e32 v18, s38 +; GCN-NEXT: v_mov_b32_e32 v19, s39 +; GCN-NEXT: v_mov_b32_e32 v20, s40 +; GCN-NEXT: v_mov_b32_e32 v21, s41 +; GCN-NEXT: v_mov_b32_e32 v22, s42 +; GCN-NEXT: v_mov_b32_e32 v23, s43 +; 
GCN-NEXT: v_mov_b32_e32 v24, s44 +; GCN-NEXT: v_mov_b32_e32 v25, s45 +; GCN-NEXT: v_mov_b32_e32 v26, s46 +; GCN-NEXT: v_mov_b32_e32 v27, s47 +; GCN-NEXT: v_mov_b32_e32 v28, s48 +; GCN-NEXT: v_mov_b32_e32 v29, s49 +; GCN-NEXT: v_mov_b32_e32 v30, s50 +; GCN-NEXT: v_mov_b32_e32 v31, s51 +; GCN-NEXT: v_mov_b32_e32 v34, s2 +; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_store_dwordx4 v[34:35], v[24:27] ; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: v_mov_b32_e32 v28, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: s_add_u32 s2, s0, 64 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] -; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: v_mov_b32_e32 v24, s2 -; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_store_dwordx4 v[28:29], v[20:23] +; GCN-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: v_mov_b32_e32 v20, s2 -; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: s_add_u32 s2, s0, 32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GCN-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 -; 
GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: s_endpgm entry: %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel @@ -756,93 +750,85 @@ entry: define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) { ; GCN-LABEL: double16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x124 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_lshl_b32 m0, s0, 1 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x124 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NEXT: v_mov_b32_e32 v4, s40 -; GCN-NEXT: v_mov_b32_e32 v5, s41 -; GCN-NEXT: v_mov_b32_e32 v6, s42 -; GCN-NEXT: v_mov_b32_e32 v7, s43 -; GCN-NEXT: v_mov_b32_e32 v8, s44 -; GCN-NEXT: v_mov_b32_e32 v9, s45 -; GCN-NEXT: v_mov_b32_e32 v10, s46 -; GCN-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NEXT: v_mov_b32_e32 v12, s48 -; GCN-NEXT: v_mov_b32_e32 v13, s49 -; GCN-NEXT: v_mov_b32_e32 v14, s50 -; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: v_mov_b32_e32 v20, s12 -; GCN-NEXT: v_mov_b32_e32 v21, s13 -; GCN-NEXT: v_mov_b32_e32 v22, s14 -; GCN-NEXT: v_mov_b32_e32 v23, s15 -; GCN-NEXT: v_mov_b32_e32 v24, s16 -; GCN-NEXT: v_mov_b32_e32 v25, s17 -; GCN-NEXT: v_mov_b32_e32 v26, s18 -; GCN-NEXT: v_mov_b32_e32 v27, s19 -; GCN-NEXT: v_mov_b32_e32 v28, 
s20 -; GCN-NEXT: v_mov_b32_e32 v29, s21 -; GCN-NEXT: v_mov_b32_e32 v30, s22 -; GCN-NEXT: v_mov_b32_e32 v31, s23 +; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4 +; GCN-NEXT: v_mov_b32_e32 v34, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: v_mov_b32_e32 v33, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NEXT: v_mov_b32_e32 v10, s18 +; GCN-NEXT: v_mov_b32_e32 v11, s19 +; GCN-NEXT: v_mov_b32_e32 v12, s20 +; GCN-NEXT: v_mov_b32_e32 v13, s21 +; GCN-NEXT: v_mov_b32_e32 v14, s22 +; GCN-NEXT: v_mov_b32_e32 v15, s23 +; GCN-NEXT: v_mov_b32_e32 v16, s36 +; GCN-NEXT: v_mov_b32_e32 v17, s37 +; GCN-NEXT: v_mov_b32_e32 v18, s38 +; GCN-NEXT: v_mov_b32_e32 v19, s39 +; GCN-NEXT: v_mov_b32_e32 v20, s40 +; GCN-NEXT: v_mov_b32_e32 v21, s41 +; GCN-NEXT: v_mov_b32_e32 v22, s42 +; GCN-NEXT: v_mov_b32_e32 v23, s43 +; GCN-NEXT: v_mov_b32_e32 v24, s44 +; GCN-NEXT: v_mov_b32_e32 v25, s45 +; GCN-NEXT: v_mov_b32_e32 v26, s46 +; GCN-NEXT: v_mov_b32_e32 v27, s47 +; GCN-NEXT: v_mov_b32_e32 v28, s48 +; GCN-NEXT: v_mov_b32_e32 v29, s49 +; GCN-NEXT: v_mov_b32_e32 v30, s50 +; GCN-NEXT: v_mov_b32_e32 v31, s51 ; GCN-NEXT: v_mov_b32_e32 v32, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-NEXT: v_movreld_b32_e32 v0, 0 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_movreld_b32_e32 v1, v34 +; GCN-NEXT: v_mov_b32_e32 v35, s3 +; GCN-NEXT: v_mov_b32_e32 v34, s2 +; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 
v[32:33], v[28:31] -; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_store_dwordx4 v[34:35], v[24:27] ; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: v_mov_b32_e32 v28, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: s_add_u32 s2, s0, 64 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] -; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: v_mov_b32_e32 v24, s2 -; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_store_dwordx4 v[28:29], v[20:23] +; GCN-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: v_mov_b32_e32 v20, s2 -; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: s_add_u32 s2, s0, 32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GCN-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: s_endpgm entry: %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel @@ -857,19 +843,24 
@@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x114 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x104 ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0xe4 -; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 +; GCN-NEXT: v_mov_b32_e32 v34, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v28, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x124 ; GCN-NEXT: v_mov_b32_e32 v24, s0 -; GCN-NEXT: s_load_dword s0, s[4:5], 0x124 ; GCN-NEXT: v_mov_b32_e32 v25, s1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v26, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s8, 1 +; GCN-NEXT: v_mov_b32_e32 v27, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 m0, s0, 1 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_mov_b32_e32 v6, s14 ; GCN-NEXT: v_mov_b32_e32 v7, s15 @@ -889,54 +880,44 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v21, s29 ; GCN-NEXT: v_mov_b32_e32 v22, s30 ; GCN-NEXT: v_mov_b32_e32 v23, s31 -; GCN-NEXT: v_mov_b32_e32 v26, s2 -; GCN-NEXT: v_mov_b32_e32 v27, s3 -; GCN-NEXT: v_mov_b32_e32 v28, s6 ; GCN-NEXT: v_mov_b32_e32 v29, s7 +; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-NEXT: v_movreld_b32_e32 v1, v32 +; GCN-NEXT: v_mov_b32_e32 v32, s2 +; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: v_movreld_b32_e32 v1, v34 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v31, s3 ; GCN-NEXT: v_mov_b32_e32 v30, s2 -; GCN-NEXT: s_add_u32 s2, s0, 64 +; 
GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[30:31], v[20:23] -; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_store_dwordx4 v[32:33], v[20:23] +; GCN-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: v_mov_b32_e32 v20, s2 -; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: s_add_u32 s2, s0, 32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GCN-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: v_mov_b32_e32 v9, s1 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-NEXT: s_add_u32 s0, s0, 0x60 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29] +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_dwordx2 v[4:5], v[28:29] ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-NEXT: s_endpgm entry: @@ -1845,70 +1826,70 @@ entry: define amdgpu_ps <32 x float> 
@float32_inselt_vec(<32 x float> %vec, i32 %sel) { ; GCN-LABEL: float32_inselt_vec: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 3, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 4, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 5, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[8:9], 6, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[10:11], 7, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[12:13], 8, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[14:15], 9, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[16:17], 10, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[18:19], 11, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[20:21], 12, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[22:23], 13, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[24:25], 14, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[26:27], 15, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[28:29], 16, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[30:31], 17, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[34:35], 18, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[36:37], 19, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[38:39], 20, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[40:41], 21, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[42:43], 22, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[44:45], 23, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[46:47], 24, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[48:49], 25, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[50:51], 26, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[52:53], 27, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[54:55], 28, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[56:57], 29, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[58:59], 30, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[60:61], 31, v32 -; GCN-NEXT: v_cmp_ne_u32_e64 s[62:63], 0, v32 -; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v0, s[62:63] ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[2:3] -; GCN-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v5, 
1.0, v5, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v6, 1.0, v6, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v7, 1.0, v7, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v8, 1.0, v8, s[12:13] -; GCN-NEXT: v_cndmask_b32_e64 v9, 1.0, v9, s[14:15] -; GCN-NEXT: v_cndmask_b32_e64 v10, 1.0, v10, s[16:17] -; GCN-NEXT: v_cndmask_b32_e64 v11, 1.0, v11, s[18:19] -; GCN-NEXT: v_cndmask_b32_e64 v12, 1.0, v12, s[20:21] -; GCN-NEXT: v_cndmask_b32_e64 v13, 1.0, v13, s[22:23] -; GCN-NEXT: v_cndmask_b32_e64 v14, 1.0, v14, s[24:25] -; GCN-NEXT: v_cndmask_b32_e64 v15, 1.0, v15, s[26:27] -; GCN-NEXT: v_cndmask_b32_e64 v16, 1.0, v16, s[28:29] -; GCN-NEXT: v_cndmask_b32_e64 v17, 1.0, v17, s[30:31] -; GCN-NEXT: v_cndmask_b32_e64 v18, 1.0, v18, s[34:35] -; GCN-NEXT: v_cndmask_b32_e64 v19, 1.0, v19, s[36:37] -; GCN-NEXT: v_cndmask_b32_e64 v20, 1.0, v20, s[38:39] -; GCN-NEXT: v_cndmask_b32_e64 v21, 1.0, v21, s[40:41] -; GCN-NEXT: v_cndmask_b32_e64 v22, 1.0, v22, s[42:43] -; GCN-NEXT: v_cndmask_b32_e64 v23, 1.0, v23, s[44:45] -; GCN-NEXT: v_cndmask_b32_e64 v24, 1.0, v24, s[46:47] -; GCN-NEXT: v_cndmask_b32_e64 v25, 1.0, v25, s[48:49] -; GCN-NEXT: v_cndmask_b32_e64 v26, 1.0, v26, s[50:51] -; GCN-NEXT: v_cndmask_b32_e64 v27, 1.0, v27, s[52:53] -; GCN-NEXT: v_cndmask_b32_e64 v28, 1.0, v28, s[54:55] -; GCN-NEXT: v_cndmask_b32_e64 v29, 1.0, v29, s[56:57] -; GCN-NEXT: v_cndmask_b32_e64 v30, 1.0, v30, s[58:59] -; GCN-NEXT: v_cndmask_b32_e64 v31, 1.0, v31, s[60:61] +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v32 +; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 3, v32 +; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 4, v32 +; GCN-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 5, v32 +; GCN-NEXT: v_cndmask_b32_e32 v5, 1.0, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 6, v32 +; GCN-NEXT: v_cndmask_b32_e32 v6, 1.0, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 7, v32 +; GCN-NEXT: v_cndmask_b32_e32 v7, 1.0, v7, vcc +; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 8, v32 +; GCN-NEXT: v_cndmask_b32_e32 v8, 1.0, v8, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v32 +; GCN-NEXT: v_cndmask_b32_e32 v9, 1.0, v9, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 10, v32 +; GCN-NEXT: v_cndmask_b32_e32 v10, 1.0, v10, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 11, v32 +; GCN-NEXT: v_cndmask_b32_e32 v11, 1.0, v11, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 12, v32 +; GCN-NEXT: v_cndmask_b32_e32 v12, 1.0, v12, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 13, v32 +; GCN-NEXT: v_cndmask_b32_e32 v13, 1.0, v13, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 14, v32 +; GCN-NEXT: v_cndmask_b32_e32 v14, 1.0, v14, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 15, v32 +; GCN-NEXT: v_cndmask_b32_e32 v15, 1.0, v15, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 16, v32 +; GCN-NEXT: v_cndmask_b32_e32 v16, 1.0, v16, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 17, v32 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1.0, v17, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 18, v32 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1.0, v18, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 19, v32 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1.0, v19, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 20, v32 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1.0, v20, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 21, v32 +; GCN-NEXT: v_cndmask_b32_e32 v21, 1.0, v21, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 22, v32 +; GCN-NEXT: v_cndmask_b32_e32 v22, 1.0, v22, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 23, v32 +; GCN-NEXT: v_cndmask_b32_e32 v23, 1.0, v23, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 24, v32 +; GCN-NEXT: v_cndmask_b32_e32 v24, 1.0, v24, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 25, v32 +; GCN-NEXT: v_cndmask_b32_e32 v25, 1.0, v25, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v32 +; GCN-NEXT: v_cndmask_b32_e32 v26, 1.0, v26, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 27, v32 +; GCN-NEXT: v_cndmask_b32_e32 v27, 1.0, v27, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 28, v32 +; GCN-NEXT: v_cndmask_b32_e32 v28, 1.0, v28, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 29, v32 +; GCN-NEXT: 
v_cndmask_b32_e32 v29, 1.0, v29, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 30, v32 +; GCN-NEXT: v_cndmask_b32_e32 v30, 1.0, v30, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 31, v32 +; GCN-NEXT: v_cndmask_b32_e32 v31, 1.0, v31, vcc ; GCN-NEXT: ; return to shader part epilog entry: %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index b81fdd36530da..4d87ae3a6ba63 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1597,198 +1597,198 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v16bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 -; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x4 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x4 +; SI-NEXT: s_mov_b32 s23, 0x100f000 +; SI-NEXT: s_mov_b32 s22, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[14:15] +; SI-NEXT: s_mov_b64 s[20:21], s[18:19] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16 -; SI-NEXT: s_cmp_eq_u32 s7, 6 -; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[20:23], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[20:23], 0 addr64 offset:16 +; SI-NEXT: s_cmp_eq_u32 s25, 6 +; SI-NEXT: v_mov_b32_e32 v6, s24 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 7 -; SI-NEXT: s_mov_b64 s[14:15], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s25, 7 +; SI-NEXT: s_mov_b64 
s[18:19], s[22:23] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cndmask_b32_e32 v11, v10, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 4 +; SI-NEXT: s_cmp_eq_u32 s25, 4 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_cmp_eq_u32 s25, 5 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 2 +; SI-NEXT: s_cmp_eq_u32 s25, 2 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 3 +; SI-NEXT: s_cmp_eq_u32 s25, 3 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s25, 0 +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s25, 1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s25, 14 +; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s25, 15 ; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc +; SI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[0:1] ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s25, 12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[0:1] -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_cselect_b64 s[14:15], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11] ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cndmask_b32_e64 v12, v13, v6, s[2:3] -; SI-NEXT: s_cmp_eq_u32 s7, 1 +; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: 
v_cndmask_b32_e64 v11, v13, v6, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v12, v15, v6, s[12:13] +; SI-NEXT: s_cmp_eq_u32 s25, 13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 14 -; SI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5] -; SI-NEXT: v_or_b32_e32 v8, v8, v12 -; SI-NEXT: v_cndmask_b32_e32 v12, v14, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 12 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cndmask_b32_e32 v12, v15, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 13 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 10 +; SI-NEXT: s_cmp_eq_u32 s25, 10 +; SI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[6:7] ; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_cndmask_b32_e64 v11, v14, v6, s[8:9] ; SI-NEXT: v_cndmask_b32_e32 v12, v16, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 11 +; SI-NEXT: s_cmp_eq_u32 s25, 11 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: 
v_or_b32_e32 v2, v2, v12 -; SI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s25, 8 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_cndmask_b32_e32 v11, v17, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 9 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: s_cmp_eq_u32 s25, 9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[14:15] +; SI-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 ; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[12:15], 0 addr64 +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[16:19], 0 addr64 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[16:19], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 -; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 +; VI-NEXT: v_mov_b32_e32 v0, s19 +; VI-NEXT: 
v_add_u32_e32 v4, vcc, s18, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v9, s1 -; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s7, 14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s6 +; VI-NEXT: s_cmp_eq_u32 s21, 14 +; VI-NEXT: v_mov_b32_e32 v12, s20 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 15 +; VI-NEXT: s_cmp_eq_u32 s21, 15 +; VI-NEXT: v_mov_b32_e32 v9, s17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v10, v3, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cmp_eq_u32 s21, 12 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 13 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: s_cmp_eq_u32 s21, 13 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 10 +; VI-NEXT: s_cmp_eq_u32 s21, 10 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 11 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 8 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] -; VI-NEXT: s_cmp_eq_u32 s7, 9 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; VI-NEXT: s_cmp_eq_u32 s21, 11 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; 
VI-NEXT: s_cmp_eq_u32 s7, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 7 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 8 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 4 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 5 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; VI-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 9 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v10, v13, v12, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 6 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, s16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: 
s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v11, v14, v12, s[8:9] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 4 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] +; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: s_cselect_b64 s[14:15], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v13, v15, v12, s[12:13] +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; VI-NEXT: s_cmp_eq_u32 s21, 5 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 2 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s21, 2 +; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 3 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: s_cmp_eq_u32 s21, 3 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 0 +; VI-NEXT: v_cndmask_b32_e64 v14, v17, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[14:15] +; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; 
VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 1 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -1797,50 +1797,49 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX900-LABEL: v_insertelement_v16bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[28:29], s[8:9], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[28:31], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[26:27], s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX900-NEXT: s_mov_b32 s30, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[38:39] -; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[38:39] offset:16 -; GFX900-NEXT: s_cmp_eq_u32 s29, 6 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[30:31] +; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[30:31] offset:16 +; GFX900-NEXT: s_cmp_eq_u32 s27, 6 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 7 +; GFX900-NEXT: s_cmp_eq_u32 s27, 7 ; GFX900-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 
s29, 4 +; GFX900-NEXT: s_cmp_eq_u32 s27, 4 ; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 5 +; GFX900-NEXT: s_cmp_eq_u32 s27, 5 ; GFX900-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 2 +; GFX900-NEXT: s_cmp_eq_u32 s27, 2 ; GFX900-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 3 +; GFX900-NEXT: s_cmp_eq_u32 s27, 3 ; GFX900-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 0 +; GFX900-NEXT: s_cmp_eq_u32 s27, 0 ; GFX900-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 1 +; GFX900-NEXT: s_cmp_eq_u32 s27, 1 ; GFX900-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 14 -; GFX900-NEXT: v_mov_b32_e32 v9, s28 +; GFX900-NEXT: s_cmp_eq_u32 s27, 14 ; GFX900-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 15 +; GFX900-NEXT: s_cmp_eq_u32 s27, 15 ; GFX900-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 12 +; GFX900-NEXT: s_cmp_eq_u32 s27, 12 ; GFX900-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 13 +; GFX900-NEXT: s_cmp_eq_u32 s27, 13 ; GFX900-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 10 +; GFX900-NEXT: s_cmp_eq_u32 s27, 10 +; GFX900-NEXT: v_mov_b32_e32 v9, s26 ; GFX900-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 11 +; GFX900-NEXT: s_cmp_eq_u32 s27, 11 ; GFX900-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 8 -; GFX900-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s29, 9 -; GFX900-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s27, 8 ; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc ; GFX900-NEXT: s_mov_b64 vcc, s[0:1] ; GFX900-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
v_cndmask_b32_e32 v17, v4, v9, vcc ; GFX900-NEXT: s_mov_b64 vcc, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] ; GFX900-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1851,28 +1850,29 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; GFX900-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[10:11] ; GFX900-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX900-NEXT: s_mov_b64 vcc, s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[14:15] +; GFX900-NEXT: s_cmp_eq_u32 s27, 9 ; GFX900-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX900-NEXT: s_mov_b64 vcc, s[20:21] ; GFX900-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] +; GFX900-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX900-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX900-NEXT: s_mov_b64 vcc, s[24:25] -; GFX900-NEXT: v_perm_b32 v3, v3, v10, s30 -; GFX900-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v16, v5, v9, s[22:23] ; GFX900-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: s_mov_b64 vcc, s[28:29] -; GFX900-NEXT: v_perm_b32 v2, v2, v11, s30 -; GFX900-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[26:27] +; GFX900-NEXT: s_mov_b64 vcc, s[0:1] +; GFX900-NEXT: s_mov_b32 s2, 0x5040100 ; GFX900-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_perm_b32 v7, v7, v14, s30 -; GFX900-NEXT: v_perm_b32 v6, v6, v15, s30 -; GFX900-NEXT: v_perm_b32 v5, v5, v10, s30 -; GFX900-NEXT: v_perm_b32 v4, v4, v11, s30 -; GFX900-NEXT: v_perm_b32 v1, v1, v12, s30 -; GFX900-NEXT: v_perm_b32 v0, v0, v13, s30 -; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] +; GFX900-NEXT: v_perm_b32 v7, v7, v14, s2 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s2 +; GFX900-NEXT: v_perm_b32 v5, v5, v16, s2 +; GFX900-NEXT: v_perm_b32 v4, v4, v17, s2 +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s2 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s2 +; GFX900-NEXT: v_perm_b32 v1, v1, v12, s2 +; GFX900-NEXT: v_perm_b32 v0, v0, v13, s2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[28:29] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[28:29] ; GFX900-NEXT: s_endpgm ; ; GFX942-LABEL: v_insertelement_v16bf16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 2585167a6a98e..3f2f9d246f63e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -3120,50 +3120,49 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v16f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[28:29], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[28:31], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[26:27], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX9-NEXT: s_mov_b32 s30, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[38:39] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[38:39] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s29, 6 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[30:31] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[30:31] offset:16 +; GFX9-NEXT: s_cmp_eq_u32 s27, 6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 7 +; GFX9-NEXT: s_cmp_eq_u32 s27, 7 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 
-; GFX9-NEXT: s_cmp_eq_u32 s29, 4 +; GFX9-NEXT: s_cmp_eq_u32 s27, 4 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 5 +; GFX9-NEXT: s_cmp_eq_u32 s27, 5 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 2 +; GFX9-NEXT: s_cmp_eq_u32 s27, 2 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 3 +; GFX9-NEXT: s_cmp_eq_u32 s27, 3 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 0 +; GFX9-NEXT: s_cmp_eq_u32 s27, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 1 +; GFX9-NEXT: s_cmp_eq_u32 s27, 1 ; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 14 -; GFX9-NEXT: v_mov_b32_e32 v9, s28 +; GFX9-NEXT: s_cmp_eq_u32 s27, 14 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 15 +; GFX9-NEXT: s_cmp_eq_u32 s27, 15 ; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 12 +; GFX9-NEXT: s_cmp_eq_u32 s27, 12 ; GFX9-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 13 +; GFX9-NEXT: s_cmp_eq_u32 s27, 13 ; GFX9-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 10 +; GFX9-NEXT: s_cmp_eq_u32 s27, 10 +; GFX9-NEXT: v_mov_b32_e32 v9, s26 ; GFX9-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 11 +; GFX9-NEXT: s_cmp_eq_u32 s27, 11 ; GFX9-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 8 -; GFX9-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s29, 9 -; GFX9-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s27, 8 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc ; GFX9-NEXT: s_mov_b64 vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v17, v4, v9, vcc ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] ; GFX9-NEXT: 
v_cndmask_b32_e64 v11, v2, v9, s[2:3] ; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3174,124 +3173,125 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[10:11] ; GFX9-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b64 vcc, s[16:17] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[14:15] +; GFX9-NEXT: s_cmp_eq_u32 s27, 9 ; GFX9-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b64 vcc, s[20:21] ; GFX9-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b64 vcc, s[24:25] -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s30 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v16, v5, v9, s[22:23] ; GFX9-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_mov_b64 vcc, s[28:29] -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s30 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[26:27] +; GFX9-NEXT: s_mov_b64 vcc, s[0:1] +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v7, v7, v14, s30 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s30 -; GFX9-NEXT: v_perm_b32 v5, v5, v10, s30 -; GFX9-NEXT: v_perm_b32 v4, v4, v11, s30 -; GFX9-NEXT: v_perm_b32 v1, v1, v12, s30 -; GFX9-NEXT: v_perm_b32 v0, v0, v13, s30 -; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] +; GFX9-NEXT: v_perm_b32 v7, v7, v14, s2 +; 
GFX9-NEXT: v_perm_b32 v6, v6, v15, s2 +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s2 +; GFX9-NEXT: v_perm_b32 v4, v4, v17, s2 +; GFX9-NEXT: v_perm_b32 v3, v3, v10, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s2 +; GFX9-NEXT: v_perm_b32 v1, v1, v12, s2 +; GFX9-NEXT: v_perm_b32 v0, v0, v13, s2 +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[28:29] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[28:29] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 -; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 +; VI-NEXT: v_mov_b32_e32 v0, s19 +; VI-NEXT: v_add_u32_e32 v4, vcc, s18, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v9, s1 -; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s7, 14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s6 +; VI-NEXT: s_cmp_eq_u32 s21, 14 +; VI-NEXT: v_mov_b32_e32 v12, s20 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 15 +; VI-NEXT: s_cmp_eq_u32 s21, 15 +; VI-NEXT: v_mov_b32_e32 v9, s17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v10, v3, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: 
s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cmp_eq_u32 s21, 12 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 13 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: s_cmp_eq_u32 s21, 13 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 10 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 11 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s21, 10 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 8 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] -; VI-NEXT: s_cmp_eq_u32 s7, 9 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; VI-NEXT: s_cmp_eq_u32 s21, 11 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 7 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 8 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 4 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 5 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; VI-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 9 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v10, v13, v12, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 6 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, s16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v11, v14, v12, s[8:9] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 4 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] +; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: s_cselect_b64 s[14:15], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v13, v15, v12, s[12:13] +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; VI-NEXT: s_cmp_eq_u32 s21, 5 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 
; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 2 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s21, 2 +; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 3 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: s_cmp_eq_u32 s21, 3 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 0 +; VI-NEXT: v_cndmask_b32_e64 v14, v17, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s21, 1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[14:15] +; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 1 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
flat_store_dwordx4 v[8:9], v[4:7] @@ -3300,135 +3300,134 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v16f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: v_add_i32_e32 v0, vcc, s6, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3] +; CI-NEXT: flat_load_dwordx4 v[6:9], v[2:3] ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; CI-NEXT: s_cmp_eq_u32 s5, 15 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; CI-NEXT: s_cmp_eq_u32 s9, 15 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 14 +; CI-NEXT: s_cmp_eq_u32 s9, 14 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 13 +; CI-NEXT: s_cmp_eq_u32 s9, 13 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 12 +; CI-NEXT: s_cmp_eq_u32 s9, 12 +; CI-NEXT: v_mov_b32_e32 v11, s5 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; 
CI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc +; CI-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; CI-NEXT: v_or_b32_e32 v9, v9, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 11 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] +; CI-NEXT: s_cmp_eq_u32 s9, 11 +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v5, s[2:3] ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 10 -; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s9, 10 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; CI-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[0:1] ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; CI-NEXT: v_or_b32_e32 v9, v9, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_or_b32_e32 v8, v8, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 -; CI-NEXT: s_cmp_eq_u32 s5, 9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc +; CI-NEXT: s_cmp_eq_u32 s9, 9 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, 
v2 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 8 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 7 +; CI-NEXT: s_cmp_eq_u32 s9, 8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s9, 7 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s9, 6 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; CI-NEXT: v_or_b32_e32 v8, v8, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 +; CI-NEXT: v_cndmask_b32_e64 v13, v13, v5, s[2:3] +; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; CI-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; CI-NEXT: v_or_b32_e32 v7, v7, v10 +; CI-NEXT: v_add_i32_e64 v10, s[2:3], s4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s9, 5 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v5, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: s_cmp_eq_u32 s9, 4 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; 
CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_or_b32_e32 v10, v10, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; CI-NEXT: v_or_b32_e32 v7, v7, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v3, v3, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; CI-NEXT: v_or_b32_e32 v3, v3, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_or_b32_e32 v2, v2, v12 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; CI-NEXT: s_cmp_eq_u32 s5, 3 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_or_b32_e32 v2, v2, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: s_cmp_eq_u32 s9, 3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 2 +; CI-NEXT: s_cmp_eq_u32 s9, 2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 1 -; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s9, 1 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s9, 0 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 -; CI-NEXT: 
v_lshlrev_b32_e32 v6, 16, v12 -; CI-NEXT: v_or_b32_e32 v0, v0, v6 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_addc_u32_e64 v11, s[2:3], 0, v11, s[2:3] +; CI-NEXT: v_or_b32_e32 v1, v1, v5 +; CI-NEXT: v_or_b32_e32 v0, v0, v4 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; CI-NEXT: v_or_b32_e32 v6, v6, v12 +; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v10 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; CI-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; CI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index a3b0a7768ca67..8b39852423741 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -315,10 +315,10 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[16:17] ; ; GFX90A-LABEL: tail_call_byval_align16: diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d4..648b458fd5586 100644 --- 
a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -6414,46 +6414,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v13, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v4, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v18, vcc, v2, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v15, vcc ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v13 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v14 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v4, v[0:1] +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_add_i32_e64 v16, s[4:5], 1, v10 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_addc_u32_e64 v21, vcc, 0, v14, s[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v20, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v15, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v7, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v6, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v17, v[7:8] +; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, v[4:5] +; 
GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v9, v15, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v21, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v8, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v16, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[7:8] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6519,46 +6519,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v13, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v4, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, v2, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v15, vcc ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], 
s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v13 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v14 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v4, v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[4:5], 1, v10 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e64 v21, vcc, 0, v14, s[4:5] +; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[4:5], s[4:5], v10, v20, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v15, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v7, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v6, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v17, v[7:8] +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, v[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v9, v15, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v21, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v8, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v16, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[7:8] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6616,46 +6616,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v0, v12 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, 
v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, v8, v13, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v4, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v18, vcc, v2, v14 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v9, v15, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], 
s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v11, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, v14 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v4, v[0:1] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_add_co_u32_e64 v16, s[4:5], 1, v10 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_addc_co_u32_e64 v21, vcc, 0, v14, s[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v20, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v15, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v7, v[3:4] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v16, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v6, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v17, v[7:8] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, v[4:5] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v9, v15, v[6:7] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v21, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v8, v17, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v16, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 
v[3:4], s[4:5], v9, v12, v[7:8] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir index eaf669da83ead..cd1a0f394d4da 100644 --- a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir @@ -11,26 +11,28 @@ body: | ; REG_ALLOC-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11 ; REG_ALLOC-NEXT: {{ $}} - ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, 
renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: KILL killed renamable $vgpr4 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr2 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr0 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr3 - ; REG_ALLOC-NEXT: renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec - ; REG_ALLOC-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; REG_ALLOC-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 + ; REG_ALLOC-NEXT: renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr16, implicit $exec ; REG_ALLOC-NEXT: renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec - ; REG_ALLOC-NEXT: S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; REG_ALLOC-NEXT: S_CMP_EQ_U64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr2_sgpr3, implicit-def $scc ; REG_ALLOC-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; REG_ALLOC-NEXT: $exec = S_MOV_B64_term renamable $sgpr6_sgpr7 ; REG_ALLOC-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; REG_ALLOC-NEXT: S_BRANCH %bb.2 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.1: ; REG_ALLOC-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, 
$vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec ; REG_ALLOC-NEXT: $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc @@ -42,33 +44,33 @@ body: | ; REG_ALLOC-NEXT: liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; REG_ALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12 = IMPLICIT_DEF - ; REG_ALLOC-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMPLICIT_DEF ; REG_ALLOC-NEXT: S_BRANCH %bb.1 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.3: ; REG_ALLOC-NEXT: successors: %bb.5(0x80000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec - ; REG_ALLOC-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec ; REG_ALLOC-NEXT: S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc ; REG_ALLOC-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc 
- ; REG_ALLOC-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; REG_ALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; REG_ALLOC-NEXT: S_BRANCH %bb.5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.4: - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4) - ; REG_ALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec - ; REG_ALLOC-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec + ; REG_ALLOC-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; REG_ALLOC-NEXT: S_ENDPGM 0 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.5: ; REG_ALLOC-NEXT: successors: %bb.4(0x80000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; REG_ALLOC-NEXT: S_BRANCH %bb.4 @@ -78,26 +80,28 @@ body: | ; DEAD_INST_DEL-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11 ; DEAD_INST_DEL-NEXT: {{ $}} - ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable 
$vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr4 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr2 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr0 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr3 - ; DEAD_INST_DEL-NEXT: renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec - ; DEAD_INST_DEL-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; DEAD_INST_DEL-NEXT: KILL killed renamable 
$sgpr8_sgpr9_sgpr10_sgpr11 + ; DEAD_INST_DEL-NEXT: renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr16, implicit $exec ; DEAD_INST_DEL-NEXT: renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec - ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr2_sgpr3, implicit-def $scc ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; DEAD_INST_DEL-NEXT: $exec = S_MOV_B64_term renamable $sgpr6_sgpr7 ; DEAD_INST_DEL-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.2 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.1: ; DEAD_INST_DEL-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec ; DEAD_INST_DEL-NEXT: $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc @@ -109,33 +113,33 @@ body: | ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = COPY 
killed renamable $sgpr1 + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12 = IMPLICIT_DEF - ; DEAD_INST_DEL-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMPLICIT_DEF ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.1 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.3: ; DEAD_INST_DEL-NEXT: successors: %bb.5(0x80000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec - ; DEAD_INST_DEL-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.4: - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4) - ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = V_ADD_U32_e64 
killed $sgpr0, killed $vgpr10, 0, implicit $exec - ; DEAD_INST_DEL-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec + ; DEAD_INST_DEL-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; DEAD_INST_DEL-NEXT: S_ENDPGM 0 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.5: ; DEAD_INST_DEL-NEXT: successors: %bb.4(0x80000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 7959cee49b93f..3548b508d9409 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -31,6 +31,80 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376 ; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[32:35], v1 +; GCN-NEXT: ds_read_b128 a[36:39], v1 
offset:16 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:48 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[32:35] +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, 
a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:24576 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:16400 ; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 ; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 ; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 @@ -39,83 +113,19 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 ; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 ; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(4) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 
a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[128:131] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, 
a[120:123] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 ; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 ; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 ; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 ; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 ; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-NEXT: s_endpgm entry: call void @llvm.amdgcn.iglp.opt(i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll index 303ea50dc16cc..b4eff5a168d37 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: test_mfma_f32_32x32x2bf16: ; GFX908: ; %bb.0: 
; %bb ; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: v_mov_b32_e32 v29, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -24,13 +24,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v2, s18 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, s22 +; GFX908-NEXT: v_mov_b32_e32 v1, s23 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, s21 -; GFX908-NEXT: v_mov_b32_e32 v1, s22 -; GFX908-NEXT: v_mov_b32_e32 v2, s23 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v1 -; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v1 ; GFX908-NEXT: v_mov_b32_e32 v0, s24 ; GFX908-NEXT: v_mov_b32_e32 v1, s25 ; GFX908-NEXT: v_mov_b32_e32 v2, s26 @@ -71,7 +69,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v0, s10 ; GFX908-NEXT: v_mov_b32_e32 v1, s11 ; GFX908-NEXT: v_mov_b32_e32 v2, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 +; GFX908-NEXT: v_mov_b32_e32 v5, s21 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v1 @@ -80,7 +79,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v1, s14 ; GFX908-NEXT: v_mov_b32_e32 v2, s15 ; GFX908-NEXT: v_mov_b32_e32 v3, 1 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 @@ -94,57 +94,42 @@ 
define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, 
v[0:3], s[34:35] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a19 +; GFX908-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:96 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a4 +; GFX908-NEXT: global_store_dwordx4 v29, v[1:4], s[34:35] offset:112 +; GFX908-NEXT: global_store_dwordx4 v29, v[5:8], s[34:35] offset:64 +; GFX908-NEXT: global_store_dwordx4 v29, v[9:12], s[34:35] offset:80 +; GFX908-NEXT: global_store_dwordx4 v29, v[13:16], s[34:35] offset:32 +; GFX908-NEXT: global_store_dwordx4 v29, v[17:20], s[34:35] offset:48 +; GFX908-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] +; 
GFX908-NEXT: global_store_dwordx4 v29, v[25:28], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x2bf16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 78be949baabac..b7f542b412da5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -34,13 +34,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s23 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 @@ -81,7 +79,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 
v4, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 @@ -90,7 +89,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 @@ -104,63 +104,48 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; 
NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a19 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:96 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a22 +; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v10, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a4 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[34:35] offset:112 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[34:35] offset:64 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[34:35] offset:80 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[34:35] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[34:35] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -170,13 +155,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s23 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 -; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 -; LIT-SRCC-NEXT: 
v_mov_b32_e32 v2, s23 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 @@ -217,7 +200,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 -; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 @@ -226,7 +210,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 @@ -240,57 +225,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a19 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:96 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a30 
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a4 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[34:35] offset:112 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[34:35] offset:64 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[34:35] offset:80 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[34:35] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[34:35] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32: @@ -1198,7 +1168,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x4f16: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; NOLIT-SRCC-NEXT: s_waitcnt 
lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 @@ -1207,11 +1177,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s24 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s23 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s25 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v0 @@ -1256,8 +1224,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s12 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v6, s21 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v6, s22 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0 @@ -1267,8 +1236,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s2 @@ -1282,63 +1252,48 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a19 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[36:37] offset:96 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a4 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[36:37] offset:112 +; NOLIT-SRCC-NEXT: 
global_store_dwordx4 v29, v[5:8], s[36:37] offset:64 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[36:37] offset:80 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[36:37] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[36:37] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[36:37] +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[36:37] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x4f16: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 @@ -1347,11 +1302,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 -; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s24 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s23 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s24 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s25 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v0 @@ -1396,8 +1349,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s12 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 -; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 -; LIT-SRCC-NEXT: v_mov_b32_e32 v6, s21 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 +; LIT-SRCC-NEXT: v_mov_b32_e32 v6, s22 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0 @@ -1407,8 +1361,9 @@ define 
amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s2 @@ -1422,57 +1377,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; 
LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a19 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[36:37] offset:96 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 
v18, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a4 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[36:37] offset:112 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[36:37] offset:64 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[36:37] offset:80 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[36:37] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[36:37] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[36:37] +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[36:37] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x4f16: @@ -2463,7 +2403,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -2473,13 +2413,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s23 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; 
NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 @@ -2521,6 +2459,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 @@ -2530,6 +2469,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 @@ -2539,53 +2479,53 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[20:23], s[34:35] offset:96 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 
v11, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a19 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] -; NOLIT-SRCC-NEXT: 
global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] offset:112 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[34:35] offset:64 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[16:19], s[34:35] offset:80 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[12:15], s[34:35] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[8:11], s[34:35] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[4:7], s[34:35] +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -2595,13 +2535,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s23 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 -; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 
s24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 @@ -2643,6 +2581,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 @@ -2652,6 +2591,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 @@ -2661,47 +2601,47 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[20:23], s[34:35] offset:96 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a19 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; 
LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] -; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a17 +; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a16 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] offset:112 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[34:35] offset:64 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[16:19], s[34:35] offset:80 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[12:15], s[34:35] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[8:11], s[34:35] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[4:7], s[34:35] +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_32x32x4i8: @@ -3571,6 +3511,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -3641,8 +3582,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 -; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 7 @@ -3652,62 +3592,48 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a19 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:96 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a4 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[34:35] offset:112 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[34:35] offset:64 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[34:35] offset:80 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[34:35] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[34:35] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] +; NOLIT-SRCC-NEXT: 
global_store_dwordx4 v29, v[25:28], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -3778,8 +3704,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 -; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 -; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; LIT-SRCC-NEXT: s_nop 7 @@ -3789,57 +3714,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a19 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:96 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a16 +; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a4 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[34:35] offset:112 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[34:35] offset:64 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[34:35] offset:80 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[34:35] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[34:35] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_forward_acc: @@ -4819,7 +4729,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 @@ -4828,58 +4738,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a27 +; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a23 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: s_nop 1 -; 
NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[0:1] offset:112 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[0:1] offset:96 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[0:1] offset:80 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[0:1] offset:64 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 
v29, v[17:20], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm_splat: @@ -4887,7 +4782,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 @@ -4896,47 +4791,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a23 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:112 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:96 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:80 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a18 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:48 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a17 -; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a16 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:32 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:64 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[0:1] offset:112 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a6 +; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v22, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a0 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[0:1] offset:96 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[0:1] offset:80 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[0:1] offset:64 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[0:1] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm_splat: @@ -5355,7 +5246,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 @@ -5364,58 +5255,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a23 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 
offset:96 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[0:1] offset:112 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a19 +; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[0:1] offset:96 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[0:1] offset:80 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[0:1] offset:64 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: @@ -5455,7 +5331,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; 
LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v29, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 @@ -5464,58 +5340,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a23 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 -; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v1, a9 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[0:3], s[0:1] offset:112 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a0 +; 
LIT-SRCC-NEXT: global_store_dwordx4 v29, v[1:4], s[0:1] offset:96 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[5:8], s[0:1] offset:80 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[9:12], s[0:1] offset:64 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[13:16], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[17:20], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[21:24], s[0:1] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v29, v[25:28], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm: @@ -5969,36 +5830,35 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 -; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 @@ -6065,36 +5925,35 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 ; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 -; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index d1ba892d7f7e1..fbeb20f40ea40 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -984,10 +984,10 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 -; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 -; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 +; CHECK-SDAG-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 @@ -1017,8 +1017,8 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: 
v_readfirstlane_b32 s38, v4 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 @@ -1033,11 +1033,11 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v0 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v1 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v1 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v29 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[36:67] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -1070,9 +1070,9 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2 ; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2 ; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1 ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2 @@ -1429,10 +1429,10 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:12 ; 4-byte Folded Spill ; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 -; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 -; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 +; CHECK-SDAG-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 @@ -1462,8 +1462,8 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 @@ -1478,11 +1478,11 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v0 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v1 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v1 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v29 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[36:67] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -1515,9 +1515,9 @@ define void 
@test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2 ; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2 ; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1 ; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 5b877f5a2bbb7..46198b509c990 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -1199,49 +1199,66 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 -; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b +; GCN-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f +; GCN-NEXT: v_mov_b32_e32 v6, 0x32a5705f ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, s0, v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v4 -; GCN-NEXT: v_sub_f32_e32 v6, v4, v5 -; GCN-NEXT: v_fma_f32 v4, s0, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s0, v7 -; GCN-NEXT: v_add_f32_e32 v4, v6, v4 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-NEXT: v_mul_f32_e32 v3, s0, v2 +; GCN-NEXT: v_rndne_f32_e32 v4, v3 +; GCN-NEXT: v_sub_f32_e32 v5, v3, v4 +; GCN-NEXT: v_fma_f32 v3, s0, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s0, v6 +; GCN-NEXT: 
v_add_f32_e32 v3, v5, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_add_u32_e32 v1, s6, v0 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[96:99], v1 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:48 -; GCN-NEXT: v_mov_b32_e32 v9, 1.0 -; GCN-NEXT: v_ldexp_f32 v4, v4, v5 -; GCN-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; GCN-NEXT: v_mul_f32_e32 v10, s1, v3 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 -; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; GCN-NEXT: v_rndne_f32_e32 v11, v10 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v1 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_mov_b32_e32 v5, 1.0 +; GCN-NEXT: v_ldexp_f32 v3, v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 +; GCN-NEXT: v_mov_b32_e32 v7, 0x42b17218 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; GCN-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fma_f32 v10, s1, v3, -v10 -; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_fmac_f32_e32 v10, s1, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: v_add_u32_e32 v0, s7, v0 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: 
v_mfma_f32_32x32x1f32 a[0:31], v5, v3, a[0:31] +; GCN-NEXT: s_load_dword s8, s[4:5], 0x54 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 +; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[0:3] +; GCN-NEXT: v_mul_f32_e32 v0, s1, v2 +; GCN-NEXT: v_rndne_f32_e32 v3, v0 +; GCN-NEXT: v_sub_f32_e32 v9, v0, v3 +; GCN-NEXT: v_fma_f32 v0, s1, v2, -v0 +; GCN-NEXT: v_fmac_f32_e32 v0, s1, v6 +; GCN-NEXT: v_add_f32_e32 v0, v9, v0 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127] -; GCN-NEXT: v_add_f32_e32 v4, v12, v10 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 ; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 @@ -1249,104 +1266,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 ; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_ldexp_f32 v4, v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 -; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_mul_f32_e32 v10, s2, v3 -; GCN-NEXT: v_rndne_f32_e32 v11, v10 +; GCN-NEXT: 
v_ldexp_f32 v0, v0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GCN-NEXT: v_mul_f32_e32 v3, s2, v2 +; GCN-NEXT: v_rndne_f32_e32 v10, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] -; GCN-NEXT: v_fma_f32 v4, s2, v3, -v10 -; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fmac_f32_e32 v4, s2, v7 -; GCN-NEXT: v_add_f32_e32 v4, v12, v4 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; GCN-NEXT: v_ldexp_f32 v1, v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; GCN-NEXT: v_mul_f32_e32 v4, s3, v3 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[4:5], 0x54 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] -; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 -; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7 -; GCN-NEXT: 
v_add_f32_e32 v1, v1, v4 -; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440 -; GCN-NEXT: v_ldexp_f32 v1, v1, v4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v4, s8, v3 -; GCN-NEXT: v_fma_f32 v3, s8, v3, -v4 -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63] -; GCN-NEXT: v_rndne_f32_e32 v1, v4 -; GCN-NEXT: v_sub_f32_e32 v10, v4, v1 -; GCN-NEXT: v_fmac_f32_e32 v3, s8, v7 -; GCN-NEXT: v_add_f32_e32 v3, v10, v3 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v0, a[0:31] +; GCN-NEXT: v_sub_f32_e32 v11, v3, v10 +; GCN-NEXT: v_fma_f32 v3, s2, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s2, v6 +; GCN-NEXT: v_add_f32_e32 v3, v11, v3 ; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 -; GCN-NEXT: v_ldexp_f32 v1, v3, v1 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_add_u32_e32 v0, s7, v0 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:112 -; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:32 -; 
GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[96:99] +; GCN-NEXT: v_cvt_i32_f32_e32 v10, v10 ; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 -; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v4 +; GCN-NEXT: v_add_u32_e32 v9, 0x6000, v1 +; GCN-NEXT: v_ldexp_f32 v3, v3, v10 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v4 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: s_nop 4 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 @@ -1355,78 +1300,161 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 ; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] 
offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v3, a[0:31] +; GCN-NEXT: v_mul_f32_e32 v3, s3, v2 +; GCN-NEXT: v_rndne_f32_e32 v10, v3 +; GCN-NEXT: v_sub_f32_e32 v11, v3, v10 +; GCN-NEXT: v_fma_f32 v3, s3, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s3, v6 +; GCN-NEXT: v_add_f32_e32 v3, v11, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v10, v10 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-NEXT: 
ds_write_b128 v0, a[28:31] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152 +; GCN-NEXT: v_ldexp_f32 v1, v3, v10 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v1, a[0:31] +; GCN-NEXT: v_mul_f32_e32 v1, s8, v2 +; GCN-NEXT: v_rndne_f32_e32 v3, v1 +; GCN-NEXT: v_sub_f32_e32 v10, v1, v3 +; GCN-NEXT: v_fma_f32 v1, s8, v2, -v1 +; GCN-NEXT: v_fmac_f32_e32 v1, s8, v6 +; GCN-NEXT: v_add_f32_e32 v1, v10, v1 +; GCN-NEXT: v_exp_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) +; GCN-NEXT: v_ldexp_f32 v1, v1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GCN-NEXT: s_nop 6 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-NEXT: ds_write_b128 
v0, a[12:15] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; GCN-NEXT: ds_read_b128 a[28:31], v9 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v9 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v9 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v9 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v9 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v9 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v9 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v9 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v1, a[0:31] +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x32a5705f ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s0, v3 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v5, v4 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v6, v4, v5 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s0, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s0, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 -; 
EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s0, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v4, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v5, v3, v4 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s0, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s0, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v5, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:48 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v9, 1.0 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v5 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s1, v3 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 1.0 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v4 +; EXACTCUTOFF-NEXT: 
v_mov_b32_e32 v4, 0xc2ce8ed0 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x42b17218 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fma_f32 v10, s1, v3, -v10 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s1, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v3, a[0:31] +; EXACTCUTOFF-NEXT: s_load_dword s8, s[4:5], 0x54 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 1 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v0, s1, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v3, v0 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v9, v0, v3 +; EXACTCUTOFF-NEXT: v_fma_f32 v0, s1, v2, -v0 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v0, s1, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v0, v9, v0 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v0, v0 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v3, v3 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: s_waitcnt 
lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127] -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v10 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 ; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 @@ -1434,104 +1462,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 ; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 ; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s2, v3 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v0, v0, v3 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s2, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v3 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s2, v3, -v10 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s2, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 
offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[4:5], 0x54 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v4 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3 -; EXACTCUTOFF-NEXT: v_fma_f32 v3, s8, v3, -v4 -; 
EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63] -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v1, v4 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v1 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s8, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v0, a[0:31] +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v11, v3, v10 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s2, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s2, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v11, v3 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v3, v1 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:112 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v10 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s7 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 -; 
EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v4 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v9, 0x6000, v1 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v10 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v4 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_nop 4 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 @@ -1540,30 +1496,96 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; 
EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v3, a[0:31] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s3, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v11, v3, v10 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s3, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s3, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v11, v3 +; EXACTCUTOFF-NEXT: 
v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v10 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 2 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:49152 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v3, v10 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v1, a[0:31] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v1, s8, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v3, v1 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v1, v3 +; EXACTCUTOFF-NEXT: v_fma_f32 v1, s8, v2, -v1 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v1, s8, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v10, v1 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v2, v3 +; EXACTCUTOFF-NEXT: ; 
sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v2 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; EXACTCUTOFF-NEXT: s_nop 6 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v9 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v9 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v9 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v9 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v9 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v9 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v9 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v9 offset:57392 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v1, a[0:31] +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 2 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 
v0, a[0:3] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; EXACTCUTOFF-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll index d2712ac8e08a3..7710abf31b62f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll @@ -24,11 +24,11 @@ define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, < ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s10, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s5, v9 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s6, v10 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s7, v11 @@ -86,41 +86,23 @@ entry: } define amdgpu_ps void @tensor_load_to_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1) { -; GFX1250-SDAG-LABEL: tensor_load_to_lds_d2_vector: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX1250-SDAG-NEXT: tensor_load_to_lds s[8:11], s[0:7] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: tensor_load_to_lds_d2_vector: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: tensor_load_to_lds s[8:11], s[0:7] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: tensor_load_to_lds_d2_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s10, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1250-NEXT: v_readfirstlane_b32 s5, v9 +; GFX1250-NEXT: v_readfirstlane_b32 s6, v10 +; GFX1250-NEXT: v_readfirstlane_b32 s7, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: tensor_load_to_lds s[8:11], s[0:7] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %D0, <8 x i32> %D1, i32 27) ret void @@ -143,11 +125,11 @@ define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1 ; GFX1250-SDAG-NEXT: 
v_readfirstlane_b32 s1, v5 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s10, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s5, v9 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s6, v10 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s7, v11 @@ -204,41 +186,23 @@ entry: } define amdgpu_ps void @tensor_store_from_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1) { -; GFX1250-SDAG-LABEL: tensor_store_from_lds_d2_vector: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: tensor_store_from_lds s[8:11], s[0:7] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: tensor_store_from_lds_d2_vector: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 
s3, v7 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: tensor_store_from_lds s[8:11], s[0:7] -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: tensor_store_from_lds_d2_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s10, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1250-NEXT: v_readfirstlane_b32 s5, v9 +; GFX1250-NEXT: v_readfirstlane_b32 s6, v10 +; GFX1250-NEXT: v_readfirstlane_b32 s7, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: tensor_store_from_lds s[8:11], s[0:7] +; GFX1250-NEXT: s_endpgm entry: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> %D0, <8 x i32> %D1, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index a10c861601c2c..6302b53698cdc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -3289,58 +3289,58 @@ define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 ; GFX802-SDAG-LABEL: test_writelane_v8f64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_add_u32_e32 v27, vcc, 48, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v15 +; GFX802-SDAG-NEXT: v_add_u32_e32 v15, vcc, 16, v0 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v16 +; GFX802-SDAG-NEXT: v_addc_u32_e32 
v16, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v17 +; GFX802-SDAG-NEXT: v_add_u32_e32 v17, vcc, 32, v0 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[19:22], v[0:1] ; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v18 -; GFX802-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[0:1] -; GFX802-SDAG-NEXT: v_add_u32_e32 v22, vcc, 16, v0 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v2 -; GFX802-SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[23:26], v[27:28] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s13, v13 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[15:16] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s15, v11 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s16, v10 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[10:13], v[17:18] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s17, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s18, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s19, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s20, v2 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX802-SDAG-NEXT: flat_load_dwordx4 v[2:5], v[22:23] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v15 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v14 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s12, v13 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s13, v12 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s14, v11 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s15, v10 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v16 -; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX802-SDAG-NEXT: v_writelane_b32 v21, s5, m0 -; GFX802-SDAG-NEXT: 
v_writelane_b32 v20, s6, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v19, s7, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v18, s8, m0 -; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8 -; GFX802-SDAG-NEXT: v_add_u32_e32 v18, vcc, 32, v0 -; GFX802-SDAG-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc -; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 48, v0 -; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6 -; GFX802-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GFX802-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[18:19] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v17 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s12, v14 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(3) -; GFX802-SDAG-NEXT: v_writelane_b32 v5, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v3, s6, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v2, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v22, s17, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v21, s18, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v20, s19, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v19, s20, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(2) +; GFX802-SDAG-NEXT: v_writelane_b32 v26, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v25, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v24, s11, m0 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX802-SDAG-NEXT: v_writelane_b32 v9, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v23, s12, m0 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: v_writelane_b32 v15, s12, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v14, s13, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v13, s14, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v12, s15, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v8, s9, m0 -; 
GFX802-SDAG-NEXT: v_writelane_b32 v7, s10, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v6, s11, m0 -; GFX802-SDAG-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[6:9] -; GFX802-SDAG-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s13, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s14, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s15, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s16, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[19:22] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[17:18], v[10:13] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[27:28], v[23:26] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[15:16], v[6:9] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3450,58 +3450,58 @@ define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 ; GFX802-GISEL-LABEL: test_writelane_v8f64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v18 -; GFX802-GISEL-NEXT: flat_load_dwordx4 v[18:21], v[0:1] -; GFX802-GISEL-NEXT: v_add_u32_e32 v22, vcc, 16, v0 +; GFX802-GISEL-NEXT: v_add_u32_e32 v27, vcc, 16, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v28, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_add_u32_e32 v29, vcc, 32, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v30, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s13, v10 +; GFX802-GISEL-NEXT: v_add_u32_e32 v10, vcc, 48, v0 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[19:22], v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s14, v11 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[23:26], v[27:28] ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 -; GFX802-GISEL-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: 
v_readfirstlane_b32 s9, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[2:5], v[29:30] +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[6:9], v[10:11] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v18 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 -; GFX802-GISEL-NEXT: flat_load_dwordx4 v[2:5], v[22:23] -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v7 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v11 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v12 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v13 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v14 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s13, v15 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s14, v16 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s15, v17 -; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX802-GISEL-NEXT: v_writelane_b32 v18, s4, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v19, s6, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v20, s7, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v21, s8, m0 -; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[18:21] -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v6 -; GFX802-GISEL-NEXT: v_add_u32_e32 v18, vcc, 32, v0 -; GFX802-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc -; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 -; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v8 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v9 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v10 -; GFX802-GISEL-NEXT: flat_load_dwordx4 v[6:9], v[18:19] -; GFX802-GISEL-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s15, v12 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s16, v13 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s17, v14 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s18, v15 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s19, v16 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s20, v17 ; GFX802-GISEL-NEXT: 
s_waitcnt vmcnt(3) -; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v3, s5, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v4, s6, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v5, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v19, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v20, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v21, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v22, s8, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX802-GISEL-NEXT: v_writelane_b32 v23, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v24, s10, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v25, s11, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v26, s12, m0 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX802-GISEL-NEXT: v_writelane_b32 v6, s8, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v7, s9, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v8, s10, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v9, s11, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, s13, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v3, s14, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s15, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v5, s16, m0 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX802-GISEL-NEXT: v_writelane_b32 v10, s12, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v11, s13, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v12, s14, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v13, s15, m0 -; GFX802-GISEL-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GFX802-GISEL-NEXT: flat_store_dwordx4 v[18:19], v[6:9] -; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GFX802-GISEL-NEXT: v_writelane_b32 v6, s17, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v7, s18, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v8, s19, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v9, s20, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[19:22] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[27:28], v[23:26] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[29:30], v[2:5] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[6:9] ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) ; 
GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 92a2f54841eed..dc2233a664f1c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -2279,173 +2279,173 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v19 ; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v27 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v29 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v3, v16 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v20 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13 ; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v29 +; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v17 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v17 -; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v4, v16 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v21 -; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v13 -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v17 -; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v12 +; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v30 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20 -; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v18, 
v13 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-NEXT: v_max_f32_e32 v20, v13, v19 ; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v5, v16 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v22 -; GFX7-NEXT: v_max_f32_e32 v13, v18, v13 -; GFX7-NEXT: v_max_f32_e32 v18, v17, v12 -; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v17, v12 ; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-NEXT: v_mov_b32_e32 v19, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v16 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v23 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v19, v0, vcc ; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v19, v1, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v19, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] ; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v16 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v24 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[14:15] ; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cndmask_b32_e64 v7, v19, v7, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] ; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v16 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v19, v8, s[18:19] -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v16 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GFX7-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[20:21] -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v16 -; GFX7-NEXT: v_max_f32_e32 v10, v10, v16 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v17, v16 ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[22:23] +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v17 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v26 +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v17 +; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v18 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v18 +; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v27 +; GFX7-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v21 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[10:11] +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v18 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v18 +; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v28 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v18 +; GFX7-NEXT: v_max_f32_e32 v12, v12, v18 +; GFX7-NEXT: v_max_f32_e32 v18, v14, v21 +; GFX7-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v12 +; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16 ; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cndmask_b32_e64 v12, v19, v13, 
s[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v19, v18, s[28:29] -; GFX7-NEXT: v_max_f32_e32 v18, v14, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v19, v18, vcc -; GFX7-NEXT: v_max_f32_e32 v16, v15, v17 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v16, vcc +; GFX7-NEXT: v_max_f32_e32 v18, v15, v16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_max_f16_e32 v16, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GFX8-NEXT: v_max_f16_e32 v20, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GFX8-NEXT: v_max_f16_e32 v21, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX8-NEXT: v_max_f16_e32 v16, v19, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v19, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GFX8-NEXT: v_max_f16_e32 v22, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GFX8-NEXT: v_max_f16_e32 v23, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX8-NEXT: v_max_f16_e32 v17, v21, v20 +; GFX8-NEXT: 
v_max_f16_e32 v24, v23, v22 +; GFX8-NEXT: v_mov_b32_e32 v26, 0x7e00 +; GFX8-NEXT: v_max_f16_e32 v19, v18, v25 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v21, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v23, v22 +; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v25 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX8-NEXT: v_max_f16_e32 v24, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v25, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 -; GFX8-NEXT: v_max_f16_e32 v17, v6, v14 +; GFX8-NEXT: v_max_f16_e32 v22, v21, v20 +; GFX8-NEXT: v_max_f16_e32 v25, v18, v23 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v21, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v23 +; GFX8-NEXT: v_max_f16_e32 v23, v6, v14 ; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v12 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX8-NEXT: v_max_f16_e32 v18, v21, v20 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v21, v20 +; GFX8-NEXT: v_max_f16_e32 v21, v4, v12 ; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 -; GFX8-NEXT: v_max_f16_e32 v4, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v10 -; GFX8-NEXT: v_max_f16_e32 v11, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc +; GFX8-NEXT: v_max_f16_e32 v12, v2, v10 +; GFX8-NEXT: 
v_cndmask_b32_e32 v3, v26, v6, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX8-NEXT: v_max_f16_e32 v13, v7, v12 -; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 +; GFX8-NEXT: v_max_f16_e32 v20, v5, v13 +; GFX8-NEXT: v_max_f16_e32 v14, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v7, v15 ; GFX8-NEXT: v_max_f16_e32 v7, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v26, v12, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_max_f16_e32 v12, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v7, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_max_f16_e32 v15, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v26, v18, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v26, v7, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v12, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: 
v_lshlrev_b32_e32 v4, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v19, v13, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; GFX8-NEXT: v_max_f16_e32 v4, v13, v5 +; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v13, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v26, v17, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v26, v19, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v26, v22, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v26, v25, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v26, v15, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v4, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v26, v24, s[6:7] +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v26, v14, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v26, v23, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v26, v20, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v26, v21, s[20:21] +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v16 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v7, 16, v13 -; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index 6c4f13a4eab8f..c7e534852bff9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -1667,169 +1667,169 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-LABEL: v_maximum_v16f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v1, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v2, v18 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v3, v19 +; GFX7-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], 
v4, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v5, v21 +; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v22 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v23 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v8, v24 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v9, v25 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v10, v26 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 +; GFX7-NEXT: v_max_f32_e32 v19, v13, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_max_f32_e32 v19, v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v0, v16 +; GFX7-NEXT: v_max_f32_e32 v20, v14, v30 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 ; GFX7-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] -; GFX7-NEXT: 
v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[40:41] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v0, s[26:27] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v19, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v20, s[40:41] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v1, v17 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v2, v18 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX8-NEXT: 
v_max_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX8-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v3, v19 +; GFX8-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v4, v20 +; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v5, v21 +; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v22 +; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v23 +; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v8, v24 ; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v9, v25 ; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v10, v26 ; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 +; GFX8-NEXT: v_max_f32_e32 v19, v13, v29 ; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_max_f32_e32 v19, v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v0, v16 +; GFX8-NEXT: v_max_f32_e32 v20, v14, v30 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 ; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX8-NEXT: 
v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[40:41] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v0, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v19, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v20, s[40:41] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; 
GFX900-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v1, v17 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v2, v18 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX900-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v3, v19 +; GFX900-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX900-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v4, v20 +; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX900-NEXT: v_cmp_o_f32_e64 s[12:13], v5, v21 +; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 ; GFX900-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v22 +; GFX900-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 ; GFX900-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v23 +; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX900-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v8, v24 ; GFX900-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v9, v25 ; GFX900-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v10, v26 ; GFX900-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX900-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 +; GFX900-NEXT: 
v_max_f32_e32 v19, v13, v29 ; GFX900-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_max_f32_e32 v19, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v0, v16 +; GFX900-NEXT: v_max_f32_e32 v20, v14, v30 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 ; GFX900-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v0, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v19, s[28:29] +; GFX900-NEXT: 
v_cndmask_b32_e64 v14, v18, v20, s[40:41] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index f971080e02c5b..fd11a3c801d38 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -1745,38 +1745,38 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] ; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX7-NEXT: v_max_f64 v[16:17], v[2:3], v[18:19] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX7-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX7-NEXT: v_max_f64 v[16:17], v[4:5], v[20:21] ; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX7-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX7-NEXT: v_mov_b32_e32 v38, 0x7ff80000 +; GFX7-NEXT: v_max_f64 v[34:35], v[2:3], v[18:19] +; GFX7-NEXT: v_max_f64 v[36:37], v[6:7], v[22:23] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[22:23] -; GFX7-NEXT: v_max_f64 v[22:23], v[8:9], v[24:25] +; GFX7-NEXT: v_max_f64 v[18:19], v[8:9], v[24:25] ; GFX7-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX7-NEXT: v_max_f64 v[24:25], v[10:11], v[26:27] +; GFX7-NEXT: v_max_f64 v[20:21], v[10:11], v[26:27] ; GFX7-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX7-NEXT: v_max_f64 v[26:27], v[12:13], v[28:29] +; GFX7-NEXT: v_max_f64 v[22:23], v[12:13], v[28:29] ; GFX7-NEXT: 
v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v16, 0, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v34, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v22, 0, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v23, v34, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v24, 0, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v25, v34, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v26, 0, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v27, v34, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v33, v38, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v4, v16, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v38, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v34, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v35, v38, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v36, 0, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, 0, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v37, v38, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v20, 0, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v19, v38, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v22, 0, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v21, v38, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v23, v38, s[14:15] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_max_f64 v[16:17], v[14:15], v[30:31] ; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] ; GFX7-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v34, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v38, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v8f64: @@ -1785,38 +1785,38 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: 
v_max_f64 v[32:33], v[0:1], v[16:17] ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX8-NEXT: v_max_f64 v[16:17], v[2:3], v[18:19] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX8-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX8-NEXT: v_max_f64 v[16:17], v[4:5], v[20:21] ; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX8-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX8-NEXT: v_mov_b32_e32 v38, 0x7ff80000 +; GFX8-NEXT: v_max_f64 v[34:35], v[2:3], v[18:19] +; GFX8-NEXT: v_max_f64 v[36:37], v[6:7], v[22:23] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[22:23] -; GFX8-NEXT: v_max_f64 v[22:23], v[8:9], v[24:25] +; GFX8-NEXT: v_max_f64 v[18:19], v[8:9], v[24:25] ; GFX8-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX8-NEXT: v_max_f64 v[24:25], v[10:11], v[26:27] +; GFX8-NEXT: v_max_f64 v[20:21], v[10:11], v[26:27] ; GFX8-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX8-NEXT: v_max_f64 v[26:27], v[12:13], v[28:29] +; GFX8-NEXT: v_max_f64 v[22:23], v[12:13], v[28:29] ; GFX8-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, 0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v34, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v22, 0, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v23, v34, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v24, 0, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v25, v34, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v26, 0, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v27, v34, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v33, v38, vcc +; GFX8-NEXT: 
v_cndmask_b32_e64 v4, v16, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v38, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v34, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v35, v38, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v36, 0, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, 0, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v37, v38, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v20, 0, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v19, v38, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v22, 0, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v21, v38, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v23, v38, s[14:15] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[16:17], v[14:15], v[30:31] ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] ; GFX8-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v34, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v38, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v8f64: @@ -1825,38 +1825,38 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX900-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] ; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX900-NEXT: v_max_f64 v[16:17], v[2:3], v[18:19] -; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] -; GFX900-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX900-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX900-NEXT: v_max_f64 v[16:17], v[4:5], v[20:21] ; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX900-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX900-NEXT: v_mov_b32_e32 v38, 0x7ff80000 +; GFX900-NEXT: v_max_f64 v[34:35], v[2:3], v[18:19] +; GFX900-NEXT: v_max_f64 v[36:37], v[6:7], v[22:23] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[22:23] -; GFX900-NEXT: v_max_f64 v[22:23], v[8:9], v[24:25] +; GFX900-NEXT: v_max_f64 v[18:19], v[8:9], v[24:25] ; 
GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX900-NEXT: v_max_f64 v[24:25], v[10:11], v[26:27] +; GFX900-NEXT: v_max_f64 v[20:21], v[10:11], v[26:27] ; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX900-NEXT: v_max_f64 v[26:27], v[12:13], v[28:29] +; GFX900-NEXT: v_max_f64 v[22:23], v[12:13], v[28:29] ; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] ; GFX900-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX900-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v2, v16, 0, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v34, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v22, 0, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v9, v23, v34, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v10, v24, 0, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v25, v34, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v26, 0, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v13, v27, v34, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e32 v1, v33, v38, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v4, v16, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v38, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v34, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v35, v38, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v36, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, 0, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v37, v38, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v20, 0, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v19, v38, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v22, 0, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v21, v38, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v23, v38, s[14:15] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_max_f64 v[16:17], v[14:15], v[30:31] ; GFX900-NEXT: v_cmp_u_f64_e32 
vcc, v[14:15], v[30:31] ; GFX900-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v38, vcc ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v8f64: @@ -2522,120 +2522,121 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-LABEL: v_maximum_v16f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x18 +; GFX10-NEXT: s_clause 0x14 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:48 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v48, off, 
s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_max_f64 v[82:83], v[0:1], v[31:32] +; GFX10-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:40 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_max_f64 v[86:87], v[0:1], v[31:32] ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[31:32] -; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_max_f64 v[84:85], v[2:3], v[33:34] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[33:34] -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_max_f64 v[32:33], v[4:5], v[35:36] -; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[35:36] -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 -; 
GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 +; GFX10-NEXT: s_clause 0xb +; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 -; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_max_f64 v[34:35], v[6:7], v[48:49] -; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[48:49] -; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[52:53] -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[54:55] -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[64:65] -; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_max_f64 v[48:49], v[8:9], v[37:38] -; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[8:9], v[37:38] -; GFX10-NEXT: v_max_f64 v[36:37], v[10:11], v[64:65] -; GFX10-NEXT: v_max_f64 v[38:39], v[12:13], v[54:55] -; GFX10-NEXT: v_max_f64 v[54:55], v[14:15], v[52:53] +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX10-NEXT: s_waitcnt vmcnt(15) +; GFX10-NEXT: v_max_f64 v[96:97], v[2:3], v[84:85] +; GFX10-NEXT: 
s_waitcnt vmcnt(14) +; GFX10-NEXT: v_max_f64 v[98:99], v[4:5], v[82:83] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[84:85] +; GFX10-NEXT: s_waitcnt vmcnt(13) +; GFX10-NEXT: v_max_f64 v[84:85], v[6:7], v[80:81] +; GFX10-NEXT: s_waitcnt vmcnt(12) +; GFX10-NEXT: v_max_f64 v[100:101], v[8:9], v[70:71] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[82:83] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[80:81] +; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[8:9], v[70:71] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v86, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v87, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v96, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v97, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v98, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v99, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v84, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v85, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v100, 0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v101, 0x7ff80000, s7 ; GFX10-NEXT: s_waitcnt vmcnt(11) -; GFX10-NEXT: v_max_f64 v[64:65], v[20:21], v[70:71] -; GFX10-NEXT: v_cmp_u_f64_e64 s13, v[20:21], v[70:71] +; GFX10-NEXT: v_max_f64 v[70:71], v[10:11], v[68:69] +; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[68:69] +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: v_max_f64 v[68:69], v[12:13], v[66:67] +; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[66:67] ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[80:81] +; GFX10-NEXT: v_max_f64 v[66:67], v[14:15], v[64:65] +; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[64:65] ; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_max_f64 v[52:53], v[16:17], v[50:51] -; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[50:51] -; GFX10-NEXT: v_max_f64 v[50:51], v[18:19], v[80:81] -; GFX10-NEXT: v_max_f64 v[70:71], v[22:23], v[68:69] -; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[68:69] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, 0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v35, 0x7ff80000, s6 -; 
GFX10-NEXT: v_cndmask_b32_e64 v8, v48, 0, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v49, 0x7ff80000, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v36, 0, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v37, 0x7ff80000, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v38, 0, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, 0x7ff80000, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v54, 0, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v55, 0x7ff80000, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v52, 0, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v53, 0x7ff80000, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v50, 0, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v51, 0x7ff80000, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v64, 0, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v65, 0x7ff80000, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v70, 0, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v71, 0x7ff80000, s14 +; GFX10-NEXT: v_max_f64 v[64:65], v[16:17], v[54:55] +; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[54:55] +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_max_f64 v[54:55], v[18:19], v[52:53] +; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[52:53] ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_max_f64 v[68:69], v[24:25], v[66:67] -; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] +; GFX10-NEXT: v_max_f64 v[52:53], v[20:21], v[50:51] +; GFX10-NEXT: v_cmp_u_f64_e64 s13, v[20:21], v[50:51] ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_max_f64 v[66:67], v[26:27], v[0:1] -; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[0:1] +; GFX10-NEXT: v_max_f64 v[50:51], v[22:23], v[48:49] +; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[48:49] +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_max_f64 v[48:49], v[24:25], v[38:39] +; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[38:39] ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_max_f64 v[80:81], v[28:29], v[2:3] -; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[2:3] +; GFX10-NEXT: v_max_f64 v[38:39], v[26:27], v[36:37] +; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], 
v[36:37] +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_max_f64 v[36:37], v[28:29], v[34:35] +; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[34:35] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[86:87], v[30:31], v[4:5] -; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[4:5] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v83, 0x7ff80000, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v84, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v85, 0x7ff80000, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v32, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v33, 0x7ff80000, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v68, 0, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v69, 0x7ff80000, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v26, v66, 0, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v67, 0x7ff80000, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v80, 0, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v81, 0x7ff80000, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v30, v86, 0, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v87, 0x7ff80000, s18 +; GFX10-NEXT: v_max_f64 v[34:35], v[30:31], v[32:33] +; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[32:33] +; GFX10-NEXT: v_cndmask_b32_e64 v10, v70, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v71, 0x7ff80000, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v68, 0, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v69, 0x7ff80000, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v66, 0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v67, 0x7ff80000, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v64, 0, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v65, 0x7ff80000, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, 0, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v55, 0x7ff80000, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v52, 0, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v53, 0x7ff80000, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v50, 0, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v51, 0x7ff80000, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v48, 0, s15 +; GFX10-NEXT: v_cndmask_b32_e64 
v25, v49, 0x7ff80000, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v26, v38, 0, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v39, 0x7ff80000, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v36, 0, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v37, 0x7ff80000, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v30, v34, 0, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v35, 0x7ff80000, s18 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 9e82b41bb9585..426f486a0a033 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -1784,87 +1784,87 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-LABEL: v_minimum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_min_f16_e32 v16, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GFX8-NEXT: v_min_f16_e32 v20, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GFX8-NEXT: v_min_f16_e32 v21, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX8-NEXT: v_min_f16_e32 v16, v19, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v19, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GFX8-NEXT: v_min_f16_e32 v22, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 -; GFX8-NEXT: 
v_lshrrev_b32_e32 v17, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GFX8-NEXT: v_min_f16_e32 v23, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX8-NEXT: v_min_f16_e32 v17, v21, v20 +; GFX8-NEXT: v_min_f16_e32 v24, v23, v22 +; GFX8-NEXT: v_mov_b32_e32 v26, 0x7e00 +; GFX8-NEXT: v_min_f16_e32 v19, v18, v25 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v21, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v23, v22 +; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v25 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX8-NEXT: v_min_f16_e32 v24, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v25, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 -; GFX8-NEXT: v_min_f16_e32 v17, v6, v14 +; GFX8-NEXT: v_min_f16_e32 v22, v21, v20 +; GFX8-NEXT: v_min_f16_e32 v25, v18, v23 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v21, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v23 +; GFX8-NEXT: v_min_f16_e32 v23, v6, v14 ; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 -; GFX8-NEXT: v_min_f16_e32 v5, v4, v12 +; GFX8-NEXT: v_min_f16_e32 v6, v3, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX8-NEXT: v_min_f16_e32 v18, v21, v20 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v21, v20 +; GFX8-NEXT: v_min_f16_e32 v21, v4, v12 ; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 -; GFX8-NEXT: v_min_f16_e32 v4, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 -; GFX8-NEXT: v_min_f16_e32 v3, v2, v10 -; GFX8-NEXT: v_min_f16_e32 
v11, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc +; GFX8-NEXT: v_min_f16_e32 v12, v2, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v26, v6, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX8-NEXT: v_min_f16_e32 v13, v7, v12 -; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 +; GFX8-NEXT: v_min_f16_e32 v20, v5, v13 +; GFX8-NEXT: v_min_f16_e32 v14, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v7, v15 ; GFX8-NEXT: v_min_f16_e32 v7, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v26, v12, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_min_f16_e32 v12, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v7, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_min_f16_e32 v15, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v26, v18, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v26, v7, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v12, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v19, v13, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; GFX8-NEXT: v_min_f16_e32 v4, v13, v5 +; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v13, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v26, v17, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v26, v19, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v26, v22, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v26, v25, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v26, v15, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v4, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v26, v24, s[6:7] +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v26, v14, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v26, v23, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v26, v20, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v26, v21, s[20:21] +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: 
v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v16 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 8adbe861fe6f0..9638cd03954bb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -1667,169 +1667,169 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-LABEL: v_minimum_v16f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v1, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v2, v18 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e64 
s[28:29], v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v3, v19 +; GFX7-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v4, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v5, v21 +; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v22 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v23 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v8, v24 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v9, v25 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v10, v26 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 +; GFX7-NEXT: v_min_f32_e32 v19, v13, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_min_f32_e32 v19, v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v0, v16 +; GFX7-NEXT: v_min_f32_e32 v20, v14, v30 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 ; GFX7-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[8:9] -; 
GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[40:41] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v0, s[26:27] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v19, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v20, s[40:41] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v16f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v1, v17 +; GFX8-NEXT: 
v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v2, v18 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX8-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v3, v19 +; GFX8-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v4, v20 +; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v5, v21 +; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v22 +; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v23 +; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v8, v24 ; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v9, v25 ; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v10, v26 ; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 +; GFX8-NEXT: v_min_f32_e32 v19, v13, v29 ; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_min_f32_e32 v19, v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v0, v16 +; GFX8-NEXT: v_min_f32_e32 v20, v14, v30 +; GFX8-NEXT: v_min_f32_e32 v0, v0, 
v16 +; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 ; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[40:41] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v0, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v19, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v20, s[40:41] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX8-NEXT: 
v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v1, v17 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v2, v18 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX900-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v3, v19 +; GFX900-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX900-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v4, v20 +; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX900-NEXT: v_cmp_o_f32_e64 s[12:13], v5, v21 +; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 ; GFX900-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v22 +; GFX900-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 ; GFX900-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v23 +; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX900-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v8, v24 ; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v9, v25 ; GFX900-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v10, v26 ; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_cmp_o_f32_e64 
s[24:25], v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 +; GFX900-NEXT: v_min_f32_e32 v19, v13, v29 ; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_min_f32_e32 v19, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v0, v16 +; GFX900-NEXT: v_min_f32_e32 v20, v14, v30 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 ; GFX900-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v0, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[16:17] +; GFX900-NEXT: 
v_cndmask_b32_e64 v10, v18, v10, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v19, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v20, s[40:41] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index dfd67873c3b86..dc86ac985dcfa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -1745,38 +1745,38 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] ; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX7-NEXT: v_min_f64 v[16:17], v[2:3], v[18:19] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX7-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX7-NEXT: v_min_f64 v[16:17], v[4:5], v[20:21] ; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX7-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX7-NEXT: v_mov_b32_e32 v38, 0x7ff80000 +; GFX7-NEXT: v_min_f64 v[34:35], v[2:3], v[18:19] +; GFX7-NEXT: v_min_f64 v[36:37], v[6:7], v[22:23] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[22:23] -; GFX7-NEXT: v_min_f64 v[22:23], v[8:9], v[24:25] +; GFX7-NEXT: v_min_f64 v[18:19], v[8:9], v[24:25] ; GFX7-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX7-NEXT: v_min_f64 v[24:25], v[10:11], v[26:27] +; 
GFX7-NEXT: v_min_f64 v[20:21], v[10:11], v[26:27] ; GFX7-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX7-NEXT: v_min_f64 v[26:27], v[12:13], v[28:29] +; GFX7-NEXT: v_min_f64 v[22:23], v[12:13], v[28:29] ; GFX7-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v16, 0, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v34, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v22, 0, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v23, v34, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v24, 0, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v25, v34, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v26, 0, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v27, v34, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v33, v38, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v4, v16, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v38, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v34, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v35, v38, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v36, 0, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, 0, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v37, v38, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v20, 0, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v19, v38, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v22, 0, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v21, v38, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v23, v38, s[14:15] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_min_f64 v[16:17], v[14:15], v[30:31] ; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] ; GFX7-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v34, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v38, vcc ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v8f64: @@ -1785,38 +1785,38 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX8-NEXT: v_min_f64 v[16:17], v[2:3], v[18:19] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX8-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX8-NEXT: v_min_f64 v[16:17], v[4:5], v[20:21] ; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX8-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX8-NEXT: v_mov_b32_e32 v38, 0x7ff80000 +; GFX8-NEXT: v_min_f64 v[34:35], v[2:3], v[18:19] +; GFX8-NEXT: v_min_f64 v[36:37], v[6:7], v[22:23] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[22:23] -; GFX8-NEXT: v_min_f64 v[22:23], v[8:9], v[24:25] +; GFX8-NEXT: v_min_f64 v[18:19], v[8:9], v[24:25] ; GFX8-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX8-NEXT: v_min_f64 v[24:25], v[10:11], v[26:27] +; GFX8-NEXT: v_min_f64 v[20:21], v[10:11], v[26:27] ; GFX8-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX8-NEXT: v_min_f64 v[26:27], v[12:13], v[28:29] +; GFX8-NEXT: v_min_f64 v[22:23], v[12:13], v[28:29] ; GFX8-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, 0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v34, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v22, 0, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v23, v34, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v24, 0, s[12:13] -; GFX8-NEXT: 
v_cndmask_b32_e64 v11, v25, v34, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v26, 0, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v27, v34, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v33, v38, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v16, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v38, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v34, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v35, v38, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v36, 0, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, 0, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v37, v38, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v20, 0, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v19, v38, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v22, 0, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v21, v38, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v23, v38, s[14:15] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_min_f64 v[16:17], v[14:15], v[30:31] ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] ; GFX8-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v34, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v38, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v8f64: @@ -1825,38 +1825,38 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX900-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] ; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX900-NEXT: v_min_f64 v[16:17], v[2:3], v[18:19] -; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] -; GFX900-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX900-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX900-NEXT: v_min_f64 v[16:17], v[4:5], v[20:21] ; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX900-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX900-NEXT: v_mov_b32_e32 v38, 0x7ff80000 +; GFX900-NEXT: v_min_f64 v[34:35], v[2:3], v[18:19] +; GFX900-NEXT: v_min_f64 v[36:37], v[6:7], v[22:23] +; GFX900-NEXT: 
v_cmp_u_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[22:23] -; GFX900-NEXT: v_min_f64 v[22:23], v[8:9], v[24:25] +; GFX900-NEXT: v_min_f64 v[18:19], v[8:9], v[24:25] ; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX900-NEXT: v_min_f64 v[24:25], v[10:11], v[26:27] +; GFX900-NEXT: v_min_f64 v[20:21], v[10:11], v[26:27] ; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX900-NEXT: v_min_f64 v[26:27], v[12:13], v[28:29] +; GFX900-NEXT: v_min_f64 v[22:23], v[12:13], v[28:29] ; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] ; GFX900-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX900-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v2, v16, 0, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v34, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v22, 0, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v9, v23, v34, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v10, v24, 0, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v25, v34, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v26, 0, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v13, v27, v34, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e32 v1, v33, v38, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v4, v16, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v38, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v34, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v35, v38, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v36, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, 0, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v37, v38, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v20, 0, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v19, v38, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v22, 0, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 
v11, v21, v38, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v23, v38, s[14:15] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_min_f64 v[16:17], v[14:15], v[30:31] ; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] ; GFX900-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v38, vcc ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v8f64: @@ -2522,120 +2522,121 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-LABEL: v_minimum_v16f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x18 +; GFX10-NEXT: s_clause 0x14 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:48 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_load_dword v35, off, 
s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_min_f64 v[82:83], v[0:1], v[31:32] +; GFX10-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:40 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_min_f64 v[86:87], v[0:1], v[31:32] ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[31:32] -; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_min_f64 v[84:85], v[2:3], v[33:34] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[33:34] -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: 
v_min_f64 v[32:33], v[4:5], v[35:36] -; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[35:36] -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 +; GFX10-NEXT: s_clause 0xb +; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 -; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_min_f64 v[34:35], v[6:7], v[48:49] -; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[48:49] -; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[52:53] -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[54:55] -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[64:65] -; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_min_f64 v[48:49], v[8:9], v[37:38] -; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[8:9], v[37:38] -; GFX10-NEXT: v_min_f64 v[36:37], v[10:11], v[64:65] -; GFX10-NEXT: v_min_f64 v[38:39], v[12:13], v[54:55] -; GFX10-NEXT: v_min_f64 v[54:55], 
v[14:15], v[52:53] +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX10-NEXT: s_waitcnt vmcnt(15) +; GFX10-NEXT: v_min_f64 v[96:97], v[2:3], v[84:85] +; GFX10-NEXT: s_waitcnt vmcnt(14) +; GFX10-NEXT: v_min_f64 v[98:99], v[4:5], v[82:83] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[84:85] +; GFX10-NEXT: s_waitcnt vmcnt(13) +; GFX10-NEXT: v_min_f64 v[84:85], v[6:7], v[80:81] +; GFX10-NEXT: s_waitcnt vmcnt(12) +; GFX10-NEXT: v_min_f64 v[100:101], v[8:9], v[70:71] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[82:83] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[80:81] +; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[8:9], v[70:71] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v86, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v87, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v96, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v97, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v98, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v99, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v84, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v85, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v100, 0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v101, 0x7ff80000, s7 ; GFX10-NEXT: s_waitcnt vmcnt(11) -; GFX10-NEXT: v_min_f64 v[64:65], v[20:21], v[70:71] -; GFX10-NEXT: v_cmp_u_f64_e64 s13, v[20:21], v[70:71] +; GFX10-NEXT: v_min_f64 v[70:71], v[10:11], v[68:69] +; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[68:69] +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: v_min_f64 v[68:69], v[12:13], v[66:67] +; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[66:67] ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[80:81] +; GFX10-NEXT: v_min_f64 v[66:67], v[14:15], v[64:65] +; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[64:65] ; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_min_f64 v[52:53], v[16:17], v[50:51] -; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[50:51] -; GFX10-NEXT: v_min_f64 v[50:51], v[18:19], v[80:81] -; GFX10-NEXT: v_min_f64 v[70:71], 
v[22:23], v[68:69] -; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[68:69] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, 0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v35, 0x7ff80000, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, 0, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v49, 0x7ff80000, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v36, 0, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v37, 0x7ff80000, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v38, 0, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, 0x7ff80000, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v54, 0, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v55, 0x7ff80000, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v52, 0, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v53, 0x7ff80000, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v50, 0, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v51, 0x7ff80000, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v64, 0, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v65, 0x7ff80000, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v70, 0, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v71, 0x7ff80000, s14 +; GFX10-NEXT: v_min_f64 v[64:65], v[16:17], v[54:55] +; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[54:55] +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_min_f64 v[54:55], v[18:19], v[52:53] +; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[52:53] ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_min_f64 v[68:69], v[24:25], v[66:67] -; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] +; GFX10-NEXT: v_min_f64 v[52:53], v[20:21], v[50:51] +; GFX10-NEXT: v_cmp_u_f64_e64 s13, v[20:21], v[50:51] ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_min_f64 v[66:67], v[26:27], v[0:1] -; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[0:1] +; GFX10-NEXT: v_min_f64 v[50:51], v[22:23], v[48:49] +; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[48:49] +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_min_f64 v[48:49], v[24:25], v[38:39] +; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[38:39] ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_min_f64 
v[80:81], v[28:29], v[2:3] -; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[2:3] +; GFX10-NEXT: v_min_f64 v[38:39], v[26:27], v[36:37] +; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[36:37] +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_min_f64 v[36:37], v[28:29], v[34:35] +; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[34:35] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_min_f64 v[86:87], v[30:31], v[4:5] -; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[4:5] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v83, 0x7ff80000, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v84, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v85, 0x7ff80000, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v32, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v33, 0x7ff80000, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v68, 0, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v69, 0x7ff80000, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v26, v66, 0, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v67, 0x7ff80000, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v80, 0, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v81, 0x7ff80000, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v30, v86, 0, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v87, 0x7ff80000, s18 +; GFX10-NEXT: v_min_f64 v[34:35], v[30:31], v[32:33] +; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[32:33] +; GFX10-NEXT: v_cndmask_b32_e64 v10, v70, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v71, 0x7ff80000, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v68, 0, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v69, 0x7ff80000, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v66, 0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v67, 0x7ff80000, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v64, 0, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v65, 0x7ff80000, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, 0, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v55, 0x7ff80000, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v52, 0, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v53, 0x7ff80000, s13 +; 
GFX10-NEXT: v_cndmask_b32_e64 v22, v50, 0, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v51, 0x7ff80000, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v48, 0, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v49, 0x7ff80000, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v26, v38, 0, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v39, 0x7ff80000, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v36, 0, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v37, 0x7ff80000, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v30, v34, 0, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v35, 0x7ff80000, s18 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 355f77acfd302..08d4ba7b25eaa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -625,30 +625,30 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI-NEXT: v_bfi_b32 v13, s6, v13, v16 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23] +; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13] ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17] +; CI-NEXT: v_add_f64 v[8:9], s[22:23], -v[16:17] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13] -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_mov_b32_e32 v9, s17 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v13, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[8:9]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[18:19], s[20:21] -; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 -; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13] -; CI-NEXT: v_add_f64 v[13:14], s[20:21], -v[18:19] +; CI-NEXT: v_mov_b32_e32 v20, s17 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[13:14]|, 0.5 -; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: v_bfi_b32 v13, s6, v13, v20 +; CI-NEXT: 
v_add_f64 v[20:21], s[20:21], -v[18:19] +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v22, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[20:21]|, 0.5 +; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13] +; CI-NEXT: v_mov_b32_e32 v13, s23 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v13, s6, v22, v13 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v13, s2 -; CI-NEXT: v_mov_b32_e32 v14, s23 -; CI-NEXT: v_mov_b32_e32 v20, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 -; CI-NEXT: v_mov_b32_e32 v21, s21 ; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13] -; CI-NEXT: v_bfi_b32 v13, s6, v20, v21 +; CI-NEXT: v_mov_b32_e32 v13, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v16, s21 +; CI-NEXT: v_bfi_b32 v13, s6, v13, v16 ; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13] ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 5b2213592f495..9d5cd44f8b028 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -1843,9 +1843,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s2 -; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10003 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10001 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10007 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10003 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10001 +; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10007 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10009 ; GFX8-NEXT: s_bfe_u32 s8, s2, 0x1000d ; GFX8-NEXT: s_and_b32 s9, s2, 1 @@ -1861,33 +1861,33 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x1000e ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: 
v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v21, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: v_mov_b32_e32 v7, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v11, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s16 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: v_mov_b32_e32 v10, s15 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v12, s9 -; GFX8-NEXT: v_mov_b32_e32 v13, s4 +; GFX8-NEXT: v_mov_b32_e32 v13, s5 +; GFX8-NEXT: v_mov_b32_e32 v15, s4 ; GFX8-NEXT: v_mov_b32_e32 v14, s14 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15] @@ -2075,42 +2075,42 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v18, v[0:1] +; GFX8-NEXT: flat_load_ushort v12, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: 
v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v21, s3 +; GFX8-NEXT: v_mov_b32_e32 v23, s1 +; GFX8-NEXT: v_mov_b32_e32 v20, s2 +; GFX8-NEXT: v_mov_b32_e32 v22, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v7, v18, 15, 1 -; GFX8-NEXT: v_bfe_i32 v6, v18, 14, 1 -; GFX8-NEXT: v_bfe_i32 v5, v18, 13, 1 -; GFX8-NEXT: v_bfe_i32 v4, v18, 12, 1 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NEXT: v_bfe_i32 v11, v18, 11, 1 -; GFX8-NEXT: v_bfe_i32 v10, v18, 10, 1 -; GFX8-NEXT: v_bfe_i32 v9, v18, 9, 1 -; GFX8-NEXT: v_bfe_i32 v8, v18, 8, 1 -; GFX8-NEXT: v_bfe_i32 v3, v18, 3, 1 -; GFX8-NEXT: v_bfe_i32 v2, v18, 2, 1 -; GFX8-NEXT: v_bfe_i32 v1, v18, 1, 1 -; GFX8-NEXT: v_bfe_i32 v0, v18, 0, 1 -; GFX8-NEXT: v_bfe_i32 v7, v18, 7, 1 -; GFX8-NEXT: v_bfe_i32 v6, v18, 6, 1 -; GFX8-NEXT: v_bfe_i32 v5, v18, 5, 1 -; GFX8-NEXT: v_bfe_i32 v4, v18, 4, 1 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GFX8-NEXT: v_bfe_i32 v3, v12, 3, 1 +; GFX8-NEXT: v_bfe_i32 v7, v12, 7, 1 +; GFX8-NEXT: v_bfe_i32 v11, v12, 11, 1 +; GFX8-NEXT: v_bfe_i32 v15, v12, 15, 1 +; GFX8-NEXT: v_bfe_i32 v2, v12, 2, 1 +; GFX8-NEXT: v_bfe_i32 v1, v12, 1, 1 +; GFX8-NEXT: v_bfe_i32 v0, v12, 0, 1 +; GFX8-NEXT: v_bfe_i32 v6, v12, 6, 1 +; GFX8-NEXT: v_bfe_i32 v5, v12, 5, 1 +; GFX8-NEXT: v_bfe_i32 v4, v12, 4, 1 +; GFX8-NEXT: v_bfe_i32 v10, v12, 10, 1 +; GFX8-NEXT: v_bfe_i32 v9, v12, 9, 1 +; GFX8-NEXT: v_bfe_i32 v8, v12, 8, 1 +; GFX8-NEXT: v_bfe_i32 v14, v12, 14, 1 +; GFX8-NEXT: 
v_bfe_i32 v13, v12, 13, 1 +; GFX8-NEXT: v_bfe_i32 v12, v12, 12, 1 +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v16i1_to_v16i32: @@ -3177,74 +3177,74 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003 -; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10001 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10007 -; GFX6-NEXT: s_bfe_u32 s7, s2, 0x10005 -; GFX6-NEXT: s_bfe_u32 s8, s2, 0x1000b -; GFX6-NEXT: s_bfe_u32 s9, s2, 0x10009 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s18, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10019 -; GFX6-NEXT: s_lshr_b32 s20, s2, 31 -; GFX6-NEXT: s_bfe_u32 s21, s2, 0x1001d -; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s24, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s27, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s28, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s29, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s33, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s35, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10019 -; GFX6-NEXT: s_lshr_b32 s37, s3, 31 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x1001d -; GFX6-NEXT: s_and_b32 s12, s2, 1 -; GFX6-NEXT: s_bfe_u32 s11, s2, 
0x10002 -; GFX6-NEXT: s_bfe_u32 s39, s2, 0x10006 -; GFX6-NEXT: s_bfe_u32 s40, s2, 0x10004 -; GFX6-NEXT: s_bfe_u32 s41, s2, 0x1000a -; GFX6-NEXT: s_bfe_u32 s42, s2, 0x10008 -; GFX6-NEXT: s_bfe_u32 s43, s2, 0x1000e -; GFX6-NEXT: s_bfe_u32 s44, s2, 0x1000c -; GFX6-NEXT: s_bfe_u32 s45, s2, 0x10012 -; GFX6-NEXT: s_bfe_u32 s46, s2, 0x10010 -; GFX6-NEXT: s_bfe_u32 s47, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s48, s2, 0x10014 -; GFX6-NEXT: s_bfe_u32 s49, s2, 0x1001a -; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 -; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001e -; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c -; GFX6-NEXT: s_and_b32 s53, s3, 1 -; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002 -; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10006 -; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10004 -; GFX6-NEXT: s_bfe_u32 s57, s3, 0x10008 -; GFX6-NEXT: s_bfe_u32 s58, s3, 0x1000e -; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000c -; GFX6-NEXT: s_bfe_u32 s60, s3, 0x10012 -; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010 -; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10016 -; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10014 -; GFX6-NEXT: s_bfe_u32 s64, s3, 0x1001a -; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018 -; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001e -; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001c -; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000a +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s6, s4, 0x10003 +; GFX6-NEXT: s_bfe_u32 s7, s4, 0x10001 +; GFX6-NEXT: s_bfe_u32 s8, s4, 0x10007 +; GFX6-NEXT: s_bfe_u32 s9, s4, 0x10005 +; GFX6-NEXT: s_bfe_u32 s10, s4, 0x1000b +; GFX6-NEXT: s_bfe_u32 s11, s4, 0x10009 +; GFX6-NEXT: s_bfe_u32 s12, s4, 0x1000f +; GFX6-NEXT: s_bfe_u32 s13, s4, 0x1000d +; GFX6-NEXT: s_bfe_u32 s14, s4, 0x10013 +; GFX6-NEXT: s_bfe_u32 s15, s4, 0x10011 +; GFX6-NEXT: s_bfe_u32 s16, s4, 0x10017 +; GFX6-NEXT: s_bfe_u32 s17, s4, 0x10015 +; GFX6-NEXT: s_bfe_u32 s18, s4, 0x1001b +; GFX6-NEXT: s_bfe_u32 s19, s4, 0x10019 +; GFX6-NEXT: s_lshr_b32 s20, s4, 31 +; 
GFX6-NEXT: s_bfe_u32 s21, s4, 0x1001d +; GFX6-NEXT: s_bfe_u32 s22, s5, 0x10003 +; GFX6-NEXT: s_bfe_u32 s23, s5, 0x10001 +; GFX6-NEXT: s_bfe_u32 s24, s5, 0x10007 +; GFX6-NEXT: s_bfe_u32 s25, s5, 0x10005 +; GFX6-NEXT: s_bfe_u32 s26, s5, 0x1000b +; GFX6-NEXT: s_bfe_u32 s27, s5, 0x10009 +; GFX6-NEXT: s_bfe_u32 s28, s5, 0x1000f +; GFX6-NEXT: s_bfe_u32 s29, s5, 0x1000d +; GFX6-NEXT: s_bfe_u32 s30, s5, 0x10013 +; GFX6-NEXT: s_bfe_u32 s31, s5, 0x10011 +; GFX6-NEXT: s_bfe_u32 s33, s5, 0x10017 +; GFX6-NEXT: s_bfe_u32 s34, s5, 0x10015 +; GFX6-NEXT: s_bfe_u32 s35, s5, 0x1001b +; GFX6-NEXT: s_bfe_u32 s36, s5, 0x10019 +; GFX6-NEXT: s_lshr_b32 s37, s5, 31 +; GFX6-NEXT: s_bfe_u32 s38, s5, 0x1001d +; GFX6-NEXT: s_and_b32 s39, s4, 1 +; GFX6-NEXT: s_bfe_u32 s40, s4, 0x10002 +; GFX6-NEXT: s_bfe_u32 s41, s4, 0x10006 +; GFX6-NEXT: s_bfe_u32 s42, s4, 0x10004 +; GFX6-NEXT: s_bfe_u32 s43, s4, 0x1000a +; GFX6-NEXT: s_bfe_u32 s44, s4, 0x10008 +; GFX6-NEXT: s_bfe_u32 s45, s4, 0x1000e +; GFX6-NEXT: s_bfe_u32 s46, s4, 0x1000c +; GFX6-NEXT: s_bfe_u32 s47, s4, 0x10012 +; GFX6-NEXT: s_bfe_u32 s48, s4, 0x10010 +; GFX6-NEXT: s_bfe_u32 s49, s4, 0x10016 +; GFX6-NEXT: s_bfe_u32 s50, s4, 0x10014 +; GFX6-NEXT: s_bfe_u32 s51, s4, 0x1001a +; GFX6-NEXT: s_bfe_u32 s52, s4, 0x10018 +; GFX6-NEXT: s_bfe_u32 s53, s4, 0x1001e +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x1001c +; GFX6-NEXT: s_and_b32 s54, s5, 1 +; GFX6-NEXT: s_bfe_u32 s55, s5, 0x10002 +; GFX6-NEXT: s_bfe_u32 s56, s5, 0x10006 +; GFX6-NEXT: s_bfe_u32 s57, s5, 0x10008 +; GFX6-NEXT: s_bfe_u32 s58, s5, 0x1000e +; GFX6-NEXT: s_bfe_u32 s59, s5, 0x1000c +; GFX6-NEXT: s_bfe_u32 s60, s5, 0x10012 +; GFX6-NEXT: s_bfe_u32 s61, s5, 0x10010 +; GFX6-NEXT: s_bfe_u32 s62, s5, 0x10016 +; GFX6-NEXT: s_bfe_u32 s63, s5, 0x10014 +; GFX6-NEXT: s_bfe_u32 s64, s5, 0x1001a +; GFX6-NEXT: s_bfe_u32 s65, s5, 0x10018 +; GFX6-NEXT: s_bfe_u32 s66, s5, 0x1001e +; GFX6-NEXT: s_bfe_u32 s67, s5, 0x1001c +; GFX6-NEXT: s_bfe_u32 s68, s5, 0x1000a +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x10004 ; 
GFX6-NEXT: v_mov_b32_e32 v0, s67 ; GFX6-NEXT: v_mov_b32_e32 v1, s38 ; GFX6-NEXT: v_mov_b32_e32 v2, s66 @@ -3254,87 +3254,86 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v6, s64 ; GFX6-NEXT: v_mov_b32_e32 v7, s35 ; GFX6-NEXT: v_mov_b32_e32 v8, s63 +; GFX6-NEXT: v_mov_b32_e32 v12, s61 +; GFX6-NEXT: v_mov_b32_e32 v16, s59 +; GFX6-NEXT: v_mov_b32_e32 v20, s57 ; GFX6-NEXT: v_mov_b32_e32 v9, s34 ; GFX6-NEXT: v_mov_b32_e32 v10, s62 ; GFX6-NEXT: v_mov_b32_e32 v11, s33 -; GFX6-NEXT: v_mov_b32_e32 v12, s61 ; GFX6-NEXT: v_mov_b32_e32 v13, s31 ; GFX6-NEXT: v_mov_b32_e32 v14, s60 ; GFX6-NEXT: v_mov_b32_e32 v15, s30 -; GFX6-NEXT: v_mov_b32_e32 v16, s59 ; GFX6-NEXT: v_mov_b32_e32 v17, s29 ; GFX6-NEXT: v_mov_b32_e32 v18, s58 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s57 ; GFX6-NEXT: v_mov_b32_e32 v19, s28 -; GFX6-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NEXT: v_mov_b32_e32 v2, s68 -; GFX6-NEXT: v_mov_b32_e32 v3, s26 +; GFX6-NEXT: v_mov_b32_e32 v21, s27 +; GFX6-NEXT: v_mov_b32_e32 v22, s68 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s56 +; GFX6-NEXT: v_mov_b32_e32 v23, s26 +; GFX6-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_waitcnt expcnt(5) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s55 +; GFX6-NEXT: v_mov_b32_e32 v2, s56 ; GFX6-NEXT: v_mov_b32_e32 v3, s24 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s53 +; GFX6-NEXT: v_mov_b32_e32 v0, s54 ; GFX6-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s54 +; GFX6-NEXT: v_mov_b32_e32 v2, s55 ; GFX6-NEXT: v_mov_b32_e32 v3, s22 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s51 +; GFX6-NEXT: v_mov_b32_e32 v2, s53 ; GFX6-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 +; GFX6-NEXT: v_mov_b32_e32 v0, s52 ; GFX6-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NEXT: v_mov_b32_e32 v2, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s51 ; GFX6-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 +; GFX6-NEXT: v_mov_b32_e32 v0, s50 ; GFX6-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s49 ; GFX6-NEXT: v_mov_b32_e32 v3, s16 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s46 +; GFX6-NEXT: v_mov_b32_e32 v0, s48 ; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s47 ; GFX6-NEXT: v_mov_b32_e32 v3, s14 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v0, s46 ; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s45 +; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s11 ; GFX6-NEXT: 
v_mov_b32_e32 v2, s43 ; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s42 ; GFX6-NEXT: v_mov_b32_e32 v1, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s41 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v0, s39 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -4013,74 +4012,74 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s4, s2, 0x10003 -; GFX6-NEXT: s_bfe_i32 s5, s2, 0x10002 -; GFX6-NEXT: s_bfe_i32 s6, s2, 0x10001 -; GFX6-NEXT: s_bfe_i32 s7, s2, 0x10000 -; GFX6-NEXT: s_bfe_i32 s8, s2, 0x10007 -; GFX6-NEXT: s_bfe_i32 s9, s2, 0x10006 -; GFX6-NEXT: s_bfe_i32 s10, s2, 0x10005 -; GFX6-NEXT: s_bfe_i32 s11, s2, 0x10004 -; GFX6-NEXT: s_bfe_i32 s12, s2, 0x1000b -; GFX6-NEXT: s_bfe_i32 s13, s2, 0x1000a -; GFX6-NEXT: s_bfe_i32 s14, s2, 0x10009 -; GFX6-NEXT: s_bfe_i32 s15, s2, 0x10008 -; GFX6-NEXT: s_bfe_i32 s16, s2, 0x1000f -; GFX6-NEXT: s_bfe_i32 s17, s2, 0x1000e -; GFX6-NEXT: s_bfe_i32 s18, s2, 0x1000d -; GFX6-NEXT: s_bfe_i32 s19, s2, 
0x1000c -; GFX6-NEXT: s_bfe_i32 s20, s2, 0x10013 -; GFX6-NEXT: s_bfe_i32 s21, s2, 0x10012 -; GFX6-NEXT: s_bfe_i32 s22, s2, 0x10011 -; GFX6-NEXT: s_bfe_i32 s23, s2, 0x10010 -; GFX6-NEXT: s_bfe_i32 s24, s2, 0x10017 -; GFX6-NEXT: s_bfe_i32 s25, s2, 0x10016 -; GFX6-NEXT: s_bfe_i32 s26, s2, 0x10015 -; GFX6-NEXT: s_bfe_i32 s27, s2, 0x10014 -; GFX6-NEXT: s_bfe_i32 s28, s2, 0x1001b -; GFX6-NEXT: s_bfe_i32 s29, s2, 0x1001a -; GFX6-NEXT: s_bfe_i32 s30, s2, 0x10019 -; GFX6-NEXT: s_bfe_i32 s31, s2, 0x10018 -; GFX6-NEXT: s_ashr_i32 s33, s2, 31 -; GFX6-NEXT: s_bfe_i32 s34, s2, 0x1001e -; GFX6-NEXT: s_bfe_i32 s35, s2, 0x1001d -; GFX6-NEXT: s_bfe_i32 s36, s2, 0x1001c -; GFX6-NEXT: s_bfe_i32 s37, s3, 0x10003 -; GFX6-NEXT: s_bfe_i32 s38, s3, 0x10002 -; GFX6-NEXT: s_bfe_i32 s39, s3, 0x10001 -; GFX6-NEXT: s_bfe_i32 s40, s3, 0x10000 -; GFX6-NEXT: s_bfe_i32 s41, s3, 0x10007 -; GFX6-NEXT: s_bfe_i32 s42, s3, 0x10006 -; GFX6-NEXT: s_bfe_i32 s43, s3, 0x10005 -; GFX6-NEXT: s_bfe_i32 s44, s3, 0x10004 -; GFX6-NEXT: s_bfe_i32 s45, s3, 0x1000b -; GFX6-NEXT: s_bfe_i32 s46, s3, 0x1000a -; GFX6-NEXT: s_bfe_i32 s47, s3, 0x10009 -; GFX6-NEXT: s_bfe_i32 s48, s3, 0x10008 -; GFX6-NEXT: s_bfe_i32 s49, s3, 0x1000e -; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000d -; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000c -; GFX6-NEXT: s_bfe_i32 s52, s3, 0x10013 -; GFX6-NEXT: s_bfe_i32 s53, s3, 0x10012 -; GFX6-NEXT: s_bfe_i32 s54, s3, 0x10011 -; GFX6-NEXT: s_bfe_i32 s55, s3, 0x10010 -; GFX6-NEXT: s_bfe_i32 s56, s3, 0x10017 -; GFX6-NEXT: s_bfe_i32 s57, s3, 0x10016 -; GFX6-NEXT: s_bfe_i32 s58, s3, 0x10015 -; GFX6-NEXT: s_bfe_i32 s59, s3, 0x10014 -; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001b -; GFX6-NEXT: s_bfe_i32 s61, s3, 0x1001a -; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10019 -; GFX6-NEXT: s_bfe_i32 s63, s3, 0x10018 -; GFX6-NEXT: s_ashr_i32 s64, s3, 31 -; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001e -; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001d -; GFX6-NEXT: s_bfe_i32 s67, s3, 0x1001c -; GFX6-NEXT: s_bfe_i32 s68, s3, 0x1000f +; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s6, s4, 0x10003 +; GFX6-NEXT: s_bfe_i32 s7, s4, 0x10002 +; GFX6-NEXT: s_bfe_i32 s8, s4, 0x10001 +; GFX6-NEXT: s_bfe_i32 s9, s4, 0x10000 +; GFX6-NEXT: s_bfe_i32 s10, s4, 0x10007 +; GFX6-NEXT: s_bfe_i32 s11, s4, 0x10006 +; GFX6-NEXT: s_bfe_i32 s12, s4, 0x10005 +; GFX6-NEXT: s_bfe_i32 s13, s4, 0x10004 +; GFX6-NEXT: s_bfe_i32 s14, s4, 0x1000b +; GFX6-NEXT: s_bfe_i32 s15, s4, 0x1000a +; GFX6-NEXT: s_bfe_i32 s16, s4, 0x10009 +; GFX6-NEXT: s_bfe_i32 s17, s4, 0x10008 +; GFX6-NEXT: s_bfe_i32 s18, s4, 0x1000f +; GFX6-NEXT: s_bfe_i32 s19, s4, 0x1000e +; GFX6-NEXT: s_bfe_i32 s20, s4, 0x1000d +; GFX6-NEXT: s_bfe_i32 s21, s4, 0x1000c +; GFX6-NEXT: s_bfe_i32 s22, s4, 0x10013 +; GFX6-NEXT: s_bfe_i32 s23, s4, 0x10012 +; GFX6-NEXT: s_bfe_i32 s24, s4, 0x10011 +; GFX6-NEXT: s_bfe_i32 s25, s4, 0x10010 +; GFX6-NEXT: s_bfe_i32 s26, s4, 0x10017 +; GFX6-NEXT: s_bfe_i32 s27, s4, 0x10016 +; GFX6-NEXT: s_bfe_i32 s28, s4, 0x10015 +; GFX6-NEXT: s_bfe_i32 s29, s4, 0x10014 +; GFX6-NEXT: s_bfe_i32 s30, s4, 0x1001b +; GFX6-NEXT: s_bfe_i32 s31, s4, 0x1001a +; GFX6-NEXT: s_bfe_i32 s33, s4, 0x10019 +; GFX6-NEXT: s_bfe_i32 s34, s4, 0x10018 +; GFX6-NEXT: s_ashr_i32 s35, s4, 31 +; GFX6-NEXT: s_bfe_i32 s36, s4, 0x1001e +; GFX6-NEXT: s_bfe_i32 s37, s4, 0x1001d +; GFX6-NEXT: s_bfe_i32 s4, s4, 0x1001c +; GFX6-NEXT: s_bfe_i32 s38, s5, 0x10003 +; GFX6-NEXT: s_bfe_i32 s39, s5, 0x10002 +; GFX6-NEXT: s_bfe_i32 s40, s5, 0x10001 +; GFX6-NEXT: s_bfe_i32 s41, s5, 0x10000 +; GFX6-NEXT: s_bfe_i32 s42, s5, 0x10007 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x10006 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x10005 +; GFX6-NEXT: s_bfe_i32 s45, s5, 0x10004 +; GFX6-NEXT: s_bfe_i32 s46, s5, 0x1000a +; GFX6-NEXT: s_bfe_i32 s47, s5, 0x10009 +; GFX6-NEXT: s_bfe_i32 s48, s5, 0x10008 +; GFX6-NEXT: s_bfe_i32 s49, s5, 0x1000e +; GFX6-NEXT: s_bfe_i32 s50, s5, 0x1000d +; GFX6-NEXT: s_bfe_i32 s51, s5, 0x1000c +; 
GFX6-NEXT: s_bfe_i32 s52, s5, 0x10013 +; GFX6-NEXT: s_bfe_i32 s53, s5, 0x10012 +; GFX6-NEXT: s_bfe_i32 s54, s5, 0x10011 +; GFX6-NEXT: s_bfe_i32 s55, s5, 0x10010 +; GFX6-NEXT: s_bfe_i32 s56, s5, 0x10017 +; GFX6-NEXT: s_bfe_i32 s57, s5, 0x10016 +; GFX6-NEXT: s_bfe_i32 s58, s5, 0x10015 +; GFX6-NEXT: s_bfe_i32 s59, s5, 0x10014 +; GFX6-NEXT: s_bfe_i32 s60, s5, 0x1001b +; GFX6-NEXT: s_bfe_i32 s61, s5, 0x1001a +; GFX6-NEXT: s_bfe_i32 s62, s5, 0x10019 +; GFX6-NEXT: s_bfe_i32 s63, s5, 0x10018 +; GFX6-NEXT: s_ashr_i32 s64, s5, 31 +; GFX6-NEXT: s_bfe_i32 s65, s5, 0x1001e +; GFX6-NEXT: s_bfe_i32 s66, s5, 0x1001d +; GFX6-NEXT: s_bfe_i32 s67, s5, 0x1001c +; GFX6-NEXT: s_bfe_i32 s68, s5, 0x1000f +; GFX6-NEXT: s_bfe_i32 s5, s5, 0x1000b ; GFX6-NEXT: v_mov_b32_e32 v0, s67 ; GFX6-NEXT: v_mov_b32_e32 v1, s66 ; GFX6-NEXT: v_mov_b32_e32 v2, s65 @@ -4090,87 +4089,86 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v6, s61 ; GFX6-NEXT: v_mov_b32_e32 v7, s60 ; GFX6-NEXT: v_mov_b32_e32 v8, s59 +; GFX6-NEXT: v_mov_b32_e32 v12, s55 +; GFX6-NEXT: v_mov_b32_e32 v16, s51 +; GFX6-NEXT: v_mov_b32_e32 v20, s48 ; GFX6-NEXT: v_mov_b32_e32 v9, s58 ; GFX6-NEXT: v_mov_b32_e32 v10, s57 ; GFX6-NEXT: v_mov_b32_e32 v11, s56 -; GFX6-NEXT: v_mov_b32_e32 v12, s55 ; GFX6-NEXT: v_mov_b32_e32 v13, s54 ; GFX6-NEXT: v_mov_b32_e32 v14, s53 ; GFX6-NEXT: v_mov_b32_e32 v15, s52 -; GFX6-NEXT: v_mov_b32_e32 v16, s51 ; GFX6-NEXT: v_mov_b32_e32 v17, s50 ; GFX6-NEXT: v_mov_b32_e32 v18, s49 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 ; GFX6-NEXT: v_mov_b32_e32 v19, s68 -; GFX6-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v21, s47 +; GFX6-NEXT: v_mov_b32_e32 v22, s46 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 
offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: v_mov_b32_e32 v23, s5 +; GFX6-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_waitcnt expcnt(5) +; GFX6-NEXT: v_mov_b32_e32 v0, s45 +; GFX6-NEXT: v_mov_b32_e32 v1, s44 +; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: v_mov_b32_e32 v3, s42 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s39 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v0, s41 +; GFX6-NEXT: v_mov_b32_e32 v1, s40 +; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v3, s38 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s33 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s31 -; GFX6-NEXT: v_mov_b32_e32 v1, s30 -; GFX6-NEXT: v_mov_b32_e32 v2, s29 -; GFX6-NEXT: v_mov_b32_e32 v3, s28 +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s33 +; GFX6-NEXT: v_mov_b32_e32 v2, s31 +; GFX6-NEXT: v_mov_b32_e32 v3, s30 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s27 -; GFX6-NEXT: v_mov_b32_e32 v1, s26 -; GFX6-NEXT: v_mov_b32_e32 v2, s25 -; GFX6-NEXT: v_mov_b32_e32 v3, s24 +; GFX6-NEXT: v_mov_b32_e32 v0, s29 +; GFX6-NEXT: v_mov_b32_e32 v1, s28 +; GFX6-NEXT: v_mov_b32_e32 v2, s27 +; GFX6-NEXT: v_mov_b32_e32 v3, s26 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s23 -; GFX6-NEXT: v_mov_b32_e32 v1, s22 -; GFX6-NEXT: v_mov_b32_e32 v2, s21 -; GFX6-NEXT: v_mov_b32_e32 v3, s20 +; GFX6-NEXT: v_mov_b32_e32 v0, s25 +; GFX6-NEXT: v_mov_b32_e32 v1, s24 +; GFX6-NEXT: v_mov_b32_e32 v2, s23 +; GFX6-NEXT: v_mov_b32_e32 v3, s22 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s19 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s21 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 +; GFX6-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NEXT: v_mov_b32_e32 v1, s16 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v3, s14 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mov_b32_e32 v1, s12 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; 
GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6014,46 +6012,46 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v13 +; GFX8-NEXT: v_mov_b32_e32 v15, v13 +; GFX8-NEXT: v_mov_b32_e32 v5, v13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: s_add_u32 s4, s0, 32 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v15, s4 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v12, v1 -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1 -; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v16, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v21, s3 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v18, s1 -; GFX8-NEXT: v_mov_b32_e32 v17, s0 -; GFX8-NEXT: v_and_b32_e32 
v6, 0xffff, v0 -; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 2, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s2 -; GFX8-NEXT: v_bfe_u32 v13, v0, 1, 1 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v6 -; GFX8-NEXT: v_bfe_u32 v0, v6, 6, 1 -; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10] -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GFX8-NEXT: v_mov_b32_e32 v20, s2 +; GFX8-NEXT: v_mov_b32_e32 v23, s1 +; GFX8-NEXT: v_mov_b32_e32 v7, v13 +; GFX8-NEXT: v_mov_b32_e32 v9, v13 +; GFX8-NEXT: v_mov_b32_e32 v11, v13 +; GFX8-NEXT: v_mov_b32_e32 v22, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX8-NEXT: v_bfe_u32 v2, v0, 5, 1 +; GFX8-NEXT: v_bfe_u32 v6, v0, 3, 1 +; GFX8-NEXT: v_bfe_u32 v10, v0, 1, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 7, v1 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 +; GFX8-NEXT: v_bfe_u32 v4, v0, 2, 1 +; GFX8-NEXT: v_bfe_u32 v0, v0, 4, 1 +; GFX8-NEXT: v_bfe_u32 v12, v1, 6, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v8i1_to_v8i64: @@ -6235,26 +6233,26 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX8-NEXT: s_add_u32 s16, s0, 48 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, s17 +; GFX8-NEXT: v_mov_b32_e32 v18, s16 +; GFX8-NEXT: s_add_u32 s16, s0, 32 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v21, s17 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: 
s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v8, s6 +; GFX8-NEXT: v_mov_b32_e32 v20, s16 ; GFX8-NEXT: v_mov_b32_e32 v9, s7 ; GFX8-NEXT: v_mov_b32_e32 v10, s8 ; GFX8-NEXT: v_mov_b32_e32 v11, s9 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v12, s10 @@ -6423,14 +6421,14 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ushort v29, off, s[8:11], 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v12, v1 ; GFX6-NEXT: v_mov_b32_e32 v14, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v12, v1 ; GFX6-NEXT: v_mov_b32_e32 v16, v1 ; GFX6-NEXT: v_mov_b32_e32 v18, v1 ; GFX6-NEXT: v_mov_b32_e32 v20, v1 @@ -6753,67 +6751,66 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v1 -; 
GFX6-NEXT: v_lshrrev_b32_e32 v4, 15, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 13, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v11, 10, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v12, 11, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v14, 8, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v16, 9, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v15, 6, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v9, 4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 5, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v13, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 15, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 5, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v1 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 1 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:112 +; GFX6-NEXT: v_bfe_i32 v6, v5, 0, 1 +; GFX6-NEXT: v_bfe_i32 v10, v4, 0, 1 +; GFX6-NEXT: v_bfe_i32 v14, v3, 0, 1 +; GFX6-NEXT: v_bfe_i32 v17, v0, 0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 14, v1 +; GFX6-NEXT: v_bfe_i32 v15, v0, 0, 1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 13, v1 +; GFX6-NEXT: v_bfe_i32 v19, v0, 0, 1 +; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 12, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_i32 v6, v10, 0, 1 -; GFX6-NEXT: v_bfe_i32 v4, v9, 0, 1 -; GFX6-NEXT: v_bfe_i32 v9, v8, 0, 1 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96 +; GFX6-NEXT: 
v_bfe_i32 v17, v0, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 9, v1 +; GFX6-NEXT: v_bfe_i32 v21, v0, 0, 1 +; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:96 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v1 +; GFX6-NEXT: v_bfe_i32 v4, v0, 0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 4, v1 +; GFX6-NEXT: v_bfe_i32 v8, v0, 0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 6, v1 +; GFX6-NEXT: v_bfe_i32 v12, v0, 0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_i32 v9, v12, 0, 1 -; GFX6-NEXT: v_bfe_i32 v7, v11, 0, 1 -; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 1 -; GFX6-NEXT: v_bfe_i32 v11, v1, 0, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 7, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 -; GFX6-NEXT: v_bfe_i32 v17, v1, 0, 1 -; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 1 -; GFX6-NEXT: v_bfe_i32 v21, v16, 0, 1 -; GFX6-NEXT: v_bfe_i32 v19, v14, 0, 1 -; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX6-NEXT: v_bfe_i32 v19, v0, 0, 1 +; GFX6-NEXT: v_bfe_i32 v0, v1, 0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v1 +; GFX6-NEXT: v_bfe_i32 v25, v1, 0, 1 +; GFX6-NEXT: v_bfe_i32 v23, v3, 0, 1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX6-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GFX6-NEXT: v_ashrrev_i32_e32 
v20, 31, v19 +; GFX6-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX6-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX6-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80 ; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64 -; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i64: @@ -6823,8 +6820,8 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v19, s1 -; GFX8-NEXT: v_mov_b32_e32 v18, s0 +; GFX8-NEXT: v_mov_b32_e32 v21, s1 +; GFX8-NEXT: v_mov_b32_e32 v20, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: s_lshr_b32 s2, s3, 14 @@ -6842,7 +6839,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s26, s3, 2 ; GFX8-NEXT: s_lshr_b32 s28, s3, 3 ; GFX8-NEXT: s_lshr_b32 s30, s3, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v16, s3 ; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 @@ -6858,74 +6855,74 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; 
GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: v_mov_b32_e32 v9, s9 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: s_add_u32 s34, s0, 0x70 +; GFX8-NEXT: s_addc_u32 s35, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s34 +; GFX8-NEXT: v_mov_b32_e32 v6, s35 +; GFX8-NEXT: s_add_u32 s34, s0, 0x60 +; GFX8-NEXT: s_addc_u32 s35, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, s34 +; GFX8-NEXT: v_mov_b32_e32 v10, s35 +; GFX8-NEXT: s_add_u32 s34, s0, 0x50 +; GFX8-NEXT: s_addc_u32 s35, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v10, s10 -; GFX8-NEXT: v_mov_b32_e32 v11, s11 -; GFX8-NEXT: v_mov_b32_e32 v12, s12 -; GFX8-NEXT: v_mov_b32_e32 v13, s13 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: v_mov_b32_e32 v18, s3 +; GFX8-NEXT: v_mov_b32_e32 v17, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] -; GFX8-NEXT: v_mov_b32_e32 v6, s18 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; 
GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v23, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v13, s34 +; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: v_mov_b32_e32 v22, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v7, s19 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 -; GFX8-NEXT: v_mov_b32_e32 v9, s21 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: v_mov_b32_e32 v14, s35 +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v10, s12 +; GFX8-NEXT: v_mov_b32_e32 v11, s13 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v12, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v25, s3 +; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v13, s15 +; GFX8-NEXT: v_mov_b32_e32 v14, s16 +; GFX8-NEXT: v_mov_b32_e32 v15, s17 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s21 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v10, s22 -; GFX8-NEXT: v_mov_b32_e32 v11, s23 -; GFX8-NEXT: v_mov_b32_e32 v12, s24 -; GFX8-NEXT: v_mov_b32_e32 v13, s25 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v24, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[12:15] +; GFX8-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NEXT: v_mov_b32_e32 v6, s24 +; GFX8-NEXT: v_mov_b32_e32 v7, s25 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: 
v_mov_b32_e32 v14, s26 -; GFX8-NEXT: v_mov_b32_e32 v15, s27 -; GFX8-NEXT: v_mov_b32_e32 v16, s28 -; GFX8-NEXT: v_mov_b32_e32 v17, s29 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s30 -; GFX8-NEXT: v_mov_b32_e32 v3, s31 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[14:17] -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_bfe_i32 v8, v16, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v16, s26 +; GFX8-NEXT: v_mov_b32_e32 v17, s27 +; GFX8-NEXT: v_mov_b32_e32 v18, s28 +; GFX8-NEXT: v_mov_b32_e32 v19, s29 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX8-NEXT: v_mov_b32_e32 v10, s30 +; GFX8-NEXT: v_mov_b32_e32 v11, s31 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v16i1_to_v16i64: @@ -7796,159 +7793,158 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s2, s4, 30 +; GFX6-NEXT: s_lshr_b32 s14, s4, 28 +; GFX6-NEXT: s_lshr_b32 s12, s4, 26 +; GFX6-NEXT: s_lshr_b32 s10, s4, 24 +; GFX6-NEXT: s_lshr_b32 s8, s4, 22 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s16 +; GFX6-NEXT: v_mov_b32_e32 v5, s14 +; GFX6-NEXT: v_mov_b32_e32 v9, s12 +; GFX6-NEXT: v_mov_b32_e32 v13, s10 +; GFX6-NEXT: v_mov_b32_e32 v17, s8 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_lshr_b32 s2, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s4, 
20 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x10000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s38, s4, 30 -; GFX6-NEXT: s_lshr_b32 s40, s4, 31 -; GFX6-NEXT: s_lshr_b32 s34, s4, 28 -; GFX6-NEXT: s_lshr_b32 s36, s4, 29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 26 -; GFX6-NEXT: s_lshr_b32 s30, s4, 27 -; GFX6-NEXT: s_lshr_b32 s24, s4, 24 -; GFX6-NEXT: s_lshr_b32 s26, s4, 25 -; GFX6-NEXT: s_lshr_b32 s20, s4, 22 -; GFX6-NEXT: s_lshr_b32 s22, s4, 23 -; GFX6-NEXT: s_lshr_b32 s18, s4, 20 -; GFX6-NEXT: s_lshr_b32 s6, s4, 21 -; GFX6-NEXT: s_lshr_b32 s8, s4, 18 -; GFX6-NEXT: s_lshr_b32 s10, s4, 19 -; GFX6-NEXT: s_lshr_b32 s12, s4, 16 -; GFX6-NEXT: s_lshr_b32 s14, s4, 17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 14 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 -; GFX6-NEXT: s_lshr_b32 s42, s4, 15 -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 12 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s17 +; GFX6-NEXT: v_mov_b32_e32 v3, s20 +; GFX6-NEXT: v_mov_b32_e32 v4, s21 +; GFX6-NEXT: v_mov_b32_e32 v21, s18 +; GFX6-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_lshr_b32 s6, s4, 29 +; GFX6-NEXT: s_lshr_b32 s8, s4, 27 +; GFX6-NEXT: s_lshr_b32 s10, s4, 25 +; GFX6-NEXT: s_lshr_b32 s12, s4, 23 +; GFX6-NEXT: s_lshr_b32 s14, s4, 21 +; GFX6-NEXT: s_lshr_b32 s16, s4, 18 +; GFX6-NEXT: s_lshr_b32 s18, s4, 19 +; GFX6-NEXT: s_lshr_b32 s20, s4, 16 +; GFX6-NEXT: s_lshr_b32 s22, s4, 17 +; GFX6-NEXT: s_lshr_b32 s24, s4, 14 +; GFX6-NEXT: s_lshr_b32 s26, s4, 15 +; GFX6-NEXT: s_lshr_b32 s28, s4, 12 +; GFX6-NEXT: s_lshr_b32 s30, s4, 13 +; GFX6-NEXT: s_lshr_b32 s34, s4, 10 +; GFX6-NEXT: s_lshr_b32 s36, s4, 11 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: s_lshr_b32 s38, s4, 8 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s15 
+; GFX6-NEXT: s_lshr_b32 s40, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_mov_b32_e32 v8, s7 +; GFX6-NEXT: s_lshr_b32 s6, s4, 6 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v10, s13 +; GFX6-NEXT: s_lshr_b32 s8, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v11, s42 +; GFX6-NEXT: v_mov_b32_e32 v12, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 4 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[10:11], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s11 +; GFX6-NEXT: s_lshr_b32 s10, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v15, s44 +; GFX6-NEXT: v_mov_b32_e32 v16, s45 +; GFX6-NEXT: s_lshr_b32 s44, s4, 2 +; GFX6-NEXT: v_mov_b32_e32 v18, s9 +; GFX6-NEXT: s_lshr_b32 s46, s4, 3 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 13 -; GFX6-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 10 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s34 -; GFX6-NEXT: v_mov_b32_e32 v7, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v10, s28 -; GFX6-NEXT: v_mov_b32_e32 v11, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 9 -; GFX6-NEXT: 
v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s24 -; GFX6-NEXT: v_mov_b32_e32 v15, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v16, s26 -; GFX6-NEXT: v_mov_b32_e32 v17, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 4 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v19, s12 +; GFX6-NEXT: v_mov_b32_e32 v20, s13 +; GFX6-NEXT: v_mov_b32_e32 v22, s19 +; GFX6-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v23, s14 +; GFX6-NEXT: v_mov_b32_e32 v24, s15 +; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v4, s48 +; GFX6-NEXT: s_waitcnt expcnt(4) +; GFX6-NEXT: v_mov_b32_e32 v5, s49 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v4, s22 ; GFX6-NEXT: v_mov_b32_e32 v5, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: 
buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s18 -; GFX6-NEXT: v_mov_b32_e32 v7, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 3 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_mov_b32_e32 v9, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(1) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: v_mov_b32_e32 v4, s14 -; 
GFX6-NEXT: v_mov_b32_e32 v5, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: v_mov_b32_e32 v4, s42 -; GFX6-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NEXT: v_mov_b32_e32 v4, s26 +; GFX6-NEXT: v_mov_b32_e32 v5, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NEXT: v_mov_b32_e32 v4, s38 -; GFX6-NEXT: v_mov_b32_e32 v5, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: v_mov_b32_e32 v4, s30 +; GFX6-NEXT: v_mov_b32_e32 v5, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: v_mov_b32_e32 v4, s34 -; GFX6-NEXT: v_mov_b32_e32 v5, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NEXT: v_mov_b32_e32 v4, s36 +; GFX6-NEXT: v_mov_b32_e32 v5, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NEXT: v_mov_b32_e32 v5, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, s9 ; GFX6-NEXT: 
buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: v_mov_b32_e32 v4, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 -; GFX6-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v4, s46 +; GFX6-NEXT: v_mov_b32_e32 v5, s47 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -8638,202 +8634,202 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003 -; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10005 -; GFX6-NEXT: s_bfe_u32 s8, s2, 0x10007 -; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10009 -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000b -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s21, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s23, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s25, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s27, s2, 0x10019 -; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d -; GFX6-NEXT: s_lshr_b32 s34, s2, 31 -; GFX6-NEXT: s_bfe_u32 s35, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s39, s3, 0x1000b 
-; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s42, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10019 -; GFX6-NEXT: s_bfe_u32 s47, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001d -; GFX6-NEXT: s_lshr_b32 s49, s3, 31 -; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001 -; GFX6-NEXT: s_and_b32 s7, s2, 1 -; GFX6-NEXT: s_and_b32 s10, s3, 1 -; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10002 -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10004 -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10006 -; GFX6-NEXT: s_bfe_u32 s18, s2, 0x10008 -; GFX6-NEXT: s_bfe_u32 s20, s2, 0x1000a -; GFX6-NEXT: s_bfe_u32 s22, s2, 0x1000c -; GFX6-NEXT: s_bfe_u32 s24, s2, 0x1000e -; GFX6-NEXT: s_bfe_u32 s26, s2, 0x10010 -; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012 -; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014 -; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 -; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a -; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c -; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e -; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002 -; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10004 -; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10006 -; GFX6-NEXT: s_bfe_u32 s57, s3, 0x10008 -; GFX6-NEXT: s_bfe_u32 s58, s3, 0x1000a -; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000c -; GFX6-NEXT: s_bfe_u32 s60, s3, 0x1000e -; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010 -; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10012 -; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10014 -; GFX6-NEXT: s_bfe_u32 s64, s3, 0x10016 -; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018 -; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001a -; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001e -; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1001c +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, 
s67 -; GFX6-NEXT: v_mov_b32_e32 v2, s49 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s6, s4, 0x10003 +; GFX6-NEXT: s_bfe_u32 s7, s4, 0x10005 +; GFX6-NEXT: s_bfe_u32 s10, s4, 0x10007 +; GFX6-NEXT: s_bfe_u32 s13, s4, 0x10009 +; GFX6-NEXT: s_bfe_u32 s15, s4, 0x1000b +; GFX6-NEXT: s_bfe_u32 s17, s4, 0x1000d +; GFX6-NEXT: s_bfe_u32 s19, s4, 0x1000f +; GFX6-NEXT: s_bfe_u32 s21, s4, 0x10011 +; GFX6-NEXT: s_bfe_u32 s23, s4, 0x10013 +; GFX6-NEXT: s_bfe_u32 s25, s4, 0x10015 +; GFX6-NEXT: s_bfe_u32 s27, s4, 0x10017 +; GFX6-NEXT: s_bfe_u32 s29, s4, 0x10019 +; GFX6-NEXT: s_bfe_u32 s31, s4, 0x1001b +; GFX6-NEXT: s_bfe_u32 s34, s4, 0x1001d +; GFX6-NEXT: s_lshr_b32 s35, s4, 31 +; GFX6-NEXT: s_bfe_u32 s36, s5, 0x10003 +; GFX6-NEXT: s_bfe_u32 s37, s5, 0x10005 +; GFX6-NEXT: s_bfe_u32 s38, s5, 0x10007 +; GFX6-NEXT: s_bfe_u32 s39, s5, 0x10009 +; GFX6-NEXT: s_bfe_u32 s40, s5, 0x1000b +; GFX6-NEXT: s_bfe_u32 s41, s5, 0x1000d +; GFX6-NEXT: s_bfe_u32 s42, s5, 0x1000f +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x10011 +; GFX6-NEXT: s_bfe_u32 s44, s5, 0x10013 +; GFX6-NEXT: s_bfe_u32 s45, s5, 0x10015 +; GFX6-NEXT: s_bfe_u32 s46, s5, 0x10017 +; GFX6-NEXT: s_bfe_u32 s47, s5, 0x10019 +; GFX6-NEXT: s_bfe_u32 s48, s5, 0x1001b +; GFX6-NEXT: s_bfe_u32 s49, s5, 0x1001d +; GFX6-NEXT: s_lshr_b32 s50, s5, 31 +; GFX6-NEXT: s_bfe_u32 s11, s5, 0x10001 +; GFX6-NEXT: s_bfe_u32 s8, s4, 0x10001 +; GFX6-NEXT: s_and_b32 s9, s4, 1 +; GFX6-NEXT: s_and_b32 s12, s5, 1 +; GFX6-NEXT: s_bfe_u32 s14, s4, 0x10002 +; GFX6-NEXT: s_bfe_u32 s16, s4, 0x10004 +; GFX6-NEXT: s_bfe_u32 s18, s4, 0x10006 +; GFX6-NEXT: s_bfe_u32 s20, s4, 0x10008 +; GFX6-NEXT: s_bfe_u32 s22, s4, 0x1000a +; GFX6-NEXT: s_bfe_u32 s24, s4, 0x1000c +; GFX6-NEXT: s_bfe_u32 s26, s4, 0x1000e +; GFX6-NEXT: s_bfe_u32 s28, s4, 0x10010 +; GFX6-NEXT: s_bfe_u32 s30, s4, 0x10012 +; GFX6-NEXT: s_bfe_u32 s33, s4, 0x10014 +; GFX6-NEXT: s_bfe_u32 s51, s4, 0x10016 +; GFX6-NEXT: s_bfe_u32 s52, s4, 0x10018 +; GFX6-NEXT: s_bfe_u32 s53, s4, 0x1001a +; GFX6-NEXT: 
s_bfe_u32 s54, s4, 0x1001c +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x1001e +; GFX6-NEXT: s_bfe_u32 s55, s5, 0x10002 +; GFX6-NEXT: s_bfe_u32 s56, s5, 0x10004 +; GFX6-NEXT: s_bfe_u32 s57, s5, 0x10006 +; GFX6-NEXT: s_bfe_u32 s58, s5, 0x10008 +; GFX6-NEXT: s_bfe_u32 s59, s5, 0x1000a +; GFX6-NEXT: s_bfe_u32 s60, s5, 0x1000c +; GFX6-NEXT: s_bfe_u32 s61, s5, 0x1000e +; GFX6-NEXT: s_bfe_u32 s62, s5, 0x10010 +; GFX6-NEXT: s_bfe_u32 s63, s5, 0x10012 +; GFX6-NEXT: s_bfe_u32 s64, s5, 0x10014 +; GFX6-NEXT: s_bfe_u32 s65, s5, 0x10016 +; GFX6-NEXT: s_bfe_u32 s66, s5, 0x10018 +; GFX6-NEXT: s_bfe_u32 s67, s5, 0x1001a +; GFX6-NEXT: s_bfe_u32 s68, s5, 0x1001e +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x1001c +; GFX6-NEXT: v_mov_b32_e32 v0, s68 +; GFX6-NEXT: v_mov_b32_e32 v2, s50 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s67 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s66 ; GFX6-NEXT: v_mov_b32_e32 v2, s47 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s65 ; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s64 ; GFX6-NEXT: v_mov_b32_e32 v2, s45 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: v_mov_b32_e32 v0, s63 ; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s62 ; GFX6-NEXT: v_mov_b32_e32 v2, s43 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s61 ; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s60 ; GFX6-NEXT: v_mov_b32_e32 v2, s41 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s59 ; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s58 ; GFX6-NEXT: v_mov_b32_e32 v2, s39 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s57 ; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s56 ; GFX6-NEXT: v_mov_b32_e32 v2, s37 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s55 ; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; 
GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s54 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s35 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s53 +; GFX6-NEXT: v_mov_b32_e32 v0, s54 ; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NEXT: v_mov_b32_e32 v0, s53 ; GFX6-NEXT: v_mov_b32_e32 v2, s31 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s51 +; GFX6-NEXT: v_mov_b32_e32 v0, s52 ; GFX6-NEXT: v_mov_b32_e32 v2, s29 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 +; GFX6-NEXT: v_mov_b32_e32 v0, s51 ; GFX6-NEXT: v_mov_b32_e32 v2, s27 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s33 ; GFX6-NEXT: v_mov_b32_e32 v2, s25 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s30 ; GFX6-NEXT: v_mov_b32_e32 v2, s23 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s28 ; GFX6-NEXT: v_mov_b32_e32 v2, s21 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s26 ; GFX6-NEXT: v_mov_b32_e32 v2, s19 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s24 ; GFX6-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s22 ; GFX6-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -9839,481 +9835,547 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s2, s4, 30 +; GFX6-NEXT: s_lshr_b32 s12, s4, 28 +; GFX6-NEXT: s_lshr_b32 s10, s4, 26 +; GFX6-NEXT: s_mov_b32 s6, s5 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x10000 +; GFX6-NEXT: s_ashr_i32 s3, s5, 31 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v6, s14 +; GFX6-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NEXT: v_mov_b32_e32 v14, s10 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_lshr_b32 s2, s5, 30 +; GFX6-NEXT: s_lshr_b32 s6, s4, 24 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s42, s5, 30 -; GFX6-NEXT: s_lshr_b32 s36, s4, 30 -; GFX6-NEXT: s_lshr_b32 s38, s4, 31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 28 -; GFX6-NEXT: s_lshr_b32 s34, s4, 29 -; GFX6-NEXT: s_lshr_b32 s26, s4, 26 -; GFX6-NEXT: s_lshr_b32 s28, s4, 27 -; 
GFX6-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NEXT: s_lshr_b32 s24, s4, 25 -; GFX6-NEXT: s_lshr_b32 s18, s4, 22 -; GFX6-NEXT: s_lshr_b32 s20, s4, 23 -; GFX6-NEXT: s_lshr_b32 s14, s4, 20 -; GFX6-NEXT: s_lshr_b32 s16, s4, 21 -; GFX6-NEXT: s_lshr_b32 s10, s4, 18 -; GFX6-NEXT: s_lshr_b32 s12, s4, 19 -; GFX6-NEXT: s_lshr_b32 s6, s4, 16 -; GFX6-NEXT: s_lshr_b32 s8, s4, 17 -; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v4, s7 -; GFX6-NEXT: s_lshr_b32 s40, s4, 14 -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_mov_b32 s44, s5 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s44 -; GFX6-NEXT: v_mov_b32_e32 v7, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 15 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 12 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 13 -; GFX6-NEXT: v_mov_b32_e32 v10, s38 -; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 10 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v14, s34 -; GFX6-NEXT: v_mov_b32_e32 v15, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: 
v_mov_b32_e32 v5, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 +; GFX6-NEXT: v_mov_b32_e32 v18, s16 +; GFX6-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:496 +; GFX6-NEXT: s_lshr_b32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s8, s4, 29 +; GFX6-NEXT: s_lshr_b32 s10, s4, 22 +; GFX6-NEXT: s_lshr_b32 s12, s4, 20 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v7, s15 ; GFX6-NEXT: v_mov_b32_e32 v8, s22 ; GFX6-NEXT: v_mov_b32_e32 v9, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v10, s24 -; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s18 -; GFX6-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v14, s20 -; GFX6-NEXT: v_mov_b32_e32 v15, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 3 -; GFX6-NEXT: v_mov_b32_e32 v4, s16 -; GFX6-NEXT: v_mov_b32_e32 v5, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 1 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: 
buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 +; GFX6-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NEXT: v_mov_b32_e32 v12, s24 +; GFX6-NEXT: v_mov_b32_e32 v13, s25 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, s18 +; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:224 +; GFX6-NEXT: s_lshr_b32 s6, s4, 27 +; GFX6-NEXT: s_lshr_b32 s8, s4, 25 +; GFX6-NEXT: s_lshr_b32 s10, s4, 18 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v15, s11 +; GFX6-NEXT: v_mov_b32_e32 v16, s22 +; GFX6-NEXT: v_mov_b32_e32 v17, s23 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s10 -; GFX6-NEXT: v_mov_b32_e32 v9, s11 -; GFX6-NEXT: s_lshr_b32 s10, s5, 29 -; GFX6-NEXT: v_mov_b32_e32 v10, s12 -; GFX6-NEXT: v_mov_b32_e32 v11, s13 -; GFX6-NEXT: s_lshr_b32 s12, s5, 28 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v10, s14 +; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 +; GFX6-NEXT: v_mov_b32_e32 v19, s17 +; GFX6-NEXT: v_mov_b32_e32 v20, s24 +; GFX6-NEXT: v_mov_b32_e32 v21, s25 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NEXT: s_lshr_b32 s6, s5, 26 -; GFX6-NEXT: v_mov_b32_e32 v14, s8 -; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s5, 27 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 +; GFX6-NEXT: v_mov_b32_e32 v14, s12 +; 
GFX6-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192 +; GFX6-NEXT: s_lshr_b32 s6, s4, 23 +; GFX6-NEXT: s_lshr_b32 s8, s4, 21 +; GFX6-NEXT: s_lshr_b32 s10, s4, 14 +; GFX6-NEXT: s_lshr_b32 s12, s4, 12 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s21 +; GFX6-NEXT: v_mov_b32_e32 v3, s22 +; GFX6-NEXT: v_mov_b32_e32 v4, s23 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 25 -; GFX6-NEXT: v_mov_b32_e32 v4, s44 -; GFX6-NEXT: v_mov_b32_e32 v5, s45 -; GFX6-NEXT: s_lshr_b32 s44, s5, 24 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 +; GFX6-NEXT: v_mov_b32_e32 v18, s10 +; GFX6-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v7, s19 +; GFX6-NEXT: v_mov_b32_e32 v8, s24 +; GFX6-NEXT: v_mov_b32_e32 v9, s25 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s42 -; GFX6-NEXT: v_mov_b32_e32 v9, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 22 -; GFX6-NEXT: v_mov_b32_e32 v10, s36 -; GFX6-NEXT: v_mov_b32_e32 v11, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 23 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GFX6-NEXT: v_mov_b32_e32 v1, s16 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_lshr_b32 s6, s4, 19 +; GFX6-NEXT: s_lshr_b32 s8, s4, 17 +; GFX6-NEXT: s_lshr_b32 s10, s4, 10 +; GFX6-NEXT: s_lshr_b32 s12, s4, 8 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x10000 +; 
GFX6-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v11, s15 +; GFX6-NEXT: v_mov_b32_e32 v12, s22 +; GFX6-NEXT: v_mov_b32_e32 v13, s23 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s38 -; GFX6-NEXT: v_mov_b32_e32 v13, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 20 -; GFX6-NEXT: v_mov_b32_e32 v14, s30 -; GFX6-NEXT: v_mov_b32_e32 v15, s31 -; GFX6-NEXT: s_lshr_b32 s4, s5, 21 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: v_mov_b32_e32 v6, s20 +; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 +; GFX6-NEXT: v_mov_b32_e32 v15, s13 +; GFX6-NEXT: v_mov_b32_e32 v16, s24 +; GFX6-NEXT: v_mov_b32_e32 v17, s25 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: s_lshr_b32 s30, s5, 18 -; GFX6-NEXT: v_mov_b32_e32 v4, s26 -; GFX6-NEXT: v_mov_b32_e32 v5, s27 -; GFX6-NEXT: s_lshr_b32 s26, s5, 19 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GFX6-NEXT: v_mov_b32_e32 v10, s18 +; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:128 +; GFX6-NEXT: s_lshr_b32 s6, s4, 15 +; GFX6-NEXT: s_lshr_b32 s8, s4, 13 +; GFX6-NEXT: s_lshr_b32 s10, s4, 6 +; GFX6-NEXT: s_lshr_b32 s12, s4, 4 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v19, s11 +; GFX6-NEXT: v_mov_b32_e32 v20, s22 +; GFX6-NEXT: v_mov_b32_e32 v21, s23 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s28 -; GFX6-NEXT: v_mov_b32_e32 v9, s29 -; GFX6-NEXT: s_lshr_b32 s28, s5, 17 -; GFX6-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NEXT: 
v_mov_b32_e32 v11, s23 -; GFX6-NEXT: s_lshr_b32 s22, s5, 16 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_mov_b32_e32 v14, s14 +; GFX6-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112 +; GFX6-NEXT: v_mov_b32_e32 v2, s17 +; GFX6-NEXT: v_mov_b32_e32 v3, s24 +; GFX6-NEXT: v_mov_b32_e32 v4, s25 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v18, s12 +; GFX6-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:96 +; GFX6-NEXT: s_lshr_b32 s6, s4, 11 +; GFX6-NEXT: s_lshr_b32 s8, s4, 9 +; GFX6-NEXT: s_lshr_b32 s10, s4, 2 +; GFX6-NEXT: s_lshr_b32 s12, s5, 28 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v7, s21 +; GFX6-NEXT: v_mov_b32_e32 v8, s22 +; GFX6-NEXT: v_mov_b32_e32 v9, s23 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_mov_b32_e32 v11, s19 ; GFX6-NEXT: v_mov_b32_e32 v12, s24 ; GFX6-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NEXT: s_lshr_b32 s24, s5, 14 -; GFX6-NEXT: v_mov_b32_e32 v14, s18 -; GFX6-NEXT: v_mov_b32_e32 v15, s19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 15 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 -; GFX6-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NEXT: s_lshr_b32 s20, s5, 12 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v18, s14 -; GFX6-NEXT: v_mov_b32_e32 v19, s15 -; GFX6-NEXT: s_lshr_b32 s14, s5, 13 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; 
GFX6-NEXT: s_lshr_b32 s16, s5, 10 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GFX6-NEXT: v_mov_b32_e32 v7, s16 +; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:64 +; GFX6-NEXT: s_lshr_b32 s6, s4, 7 +; GFX6-NEXT: s_lshr_b32 s8, s4, 5 +; GFX6-NEXT: s_lshr_b32 s10, s5, 26 +; GFX6-NEXT: s_lshr_b32 s12, s5, 24 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v15, s15 +; GFX6-NEXT: v_mov_b32_e32 v16, s22 +; GFX6-NEXT: v_mov_b32_e32 v17, s23 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_mov_b32_e32 v9, s13 -; GFX6-NEXT: s_lshr_b32 s12, s5, 11 -; GFX6-NEXT: v_mov_b32_e32 v10, s10 -; GFX6-NEXT: v_mov_b32_e32 v11, s11 -; GFX6-NEXT: s_lshr_b32 s10, s5, 8 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GFX6-NEXT: v_mov_b32_e32 v11, s20 +; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; GFX6-NEXT: v_mov_b32_e32 v19, s13 +; GFX6-NEXT: v_mov_b32_e32 v20, s24 +; GFX6-NEXT: v_mov_b32_e32 v21, s25 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NEXT: s_lshr_b32 s6, s5, 9 -; GFX6-NEXT: v_mov_b32_e32 v14, s8 -; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s5, 6 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v15, s18 +; GFX6-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:32 +; GFX6-NEXT: s_lshr_b32 s6, s4, 3 +; GFX6-NEXT: s_lshr_b32 s4, 
s4, 1 +; GFX6-NEXT: s_lshr_b32 s8, s5, 22 +; GFX6-NEXT: s_lshr_b32 s10, s5, 20 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NEXT: v_mov_b32_e32 v3, s22 +; GFX6-NEXT: v_mov_b32_e32 v4, s23 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s34 -; GFX6-NEXT: v_mov_b32_e32 v17, s35 -; GFX6-NEXT: s_lshr_b32 s34, s5, 7 -; GFX6-NEXT: v_mov_b32_e32 v18, s40 -; GFX6-NEXT: v_mov_b32_e32 v19, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v19, s14 +; GFX6-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NEXT: v_mov_b32_e32 v23, s12 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_lshr_b32 s4, s5, 29 +; GFX6-NEXT: s_lshr_b32 s6, s5, 27 +; GFX6-NEXT: s_lshr_b32 s8, s5, 25 +; GFX6-NEXT: s_lshr_b32 s10, s5, 23 +; GFX6-NEXT: s_lshr_b32 s12, s5, 21 +; GFX6-NEXT: s_lshr_b32 s14, s5, 18 +; GFX6-NEXT: s_lshr_b32 s16, s5, 19 +; GFX6-NEXT: s_lshr_b32 s18, s5, 17 +; GFX6-NEXT: s_lshr_b32 s20, s5, 16 +; GFX6-NEXT: s_lshr_b32 s22, s5, 14 +; GFX6-NEXT: s_lshr_b32 s24, s5, 15 +; GFX6-NEXT: s_lshr_b32 s26, s5, 12 +; GFX6-NEXT: s_lshr_b32 s28, s5, 13 +; GFX6-NEXT: s_lshr_b32 s30, s5, 10 +; GFX6-NEXT: s_lshr_b32 s34, s5, 11 +; GFX6-NEXT: v_mov_b32_e32 v6, s7 +; GFX6-NEXT: s_lshr_b32 s36, s5, 8 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s17 +; GFX6-NEXT: s_lshr_b32 s4, s5, 9 +; GFX6-NEXT: v_mov_b32_e32 v9, s38 +; 
GFX6-NEXT: v_mov_b32_e32 v10, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 6 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s21 +; GFX6-NEXT: s_lshr_b32 s40, s5, 7 +; GFX6-NEXT: v_mov_b32_e32 v13, s6 +; GFX6-NEXT: v_mov_b32_e32 v14, s7 +; GFX6-NEXT: s_lshr_b32 s6, s5, 4 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s19 ; GFX6-NEXT: s_lshr_b32 s42, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 2 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 3 -; GFX6-NEXT: s_lshr_b32 s44, s5, 1 +; GFX6-NEXT: v_mov_b32_e32 v17, s8 +; GFX6-NEXT: v_mov_b32_e32 v18, s9 +; GFX6-NEXT: s_lshr_b32 s8, s5, 2 +; GFX6-NEXT: v_mov_b32_e32 v20, s15 +; GFX6-NEXT: s_lshr_b32 s44, s5, 3 +; GFX6-NEXT: s_lshr_b32 s46, s5, 1 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 
0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:448 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 -; GFX6-NEXT: v_mov_b32_e32 v10, s4 -; GFX6-NEXT: v_mov_b32_e32 v11, s5 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:416 -; GFX6-NEXT: s_waitcnt expcnt(1) -; GFX6-NEXT: v_mov_b32_e32 v0, s30 -; GFX6-NEXT: v_mov_b32_e32 v1, s31 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[12:13], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v21, s10 +; GFX6-NEXT: v_mov_b32_e32 v22, s11 +; GFX6-NEXT: v_mov_b32_e32 v24, s13 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:480 +; GFX6-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:464 +; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:448 +; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:432 +; GFX6-NEXT: v_mov_b32_e32 v25, s48 +; GFX6-NEXT: v_mov_b32_e32 v26, s49 +; GFX6-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:416 +; GFX6-NEXT: s_waitcnt expcnt(5) +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s22 -; GFX6-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NEXT: v_mov_b32_e32 v3, s29 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 ; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s22 +; GFX6-NEXT: v_mov_b32_e32 v1, s23 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s30 +; GFX6-NEXT: v_mov_b32_e32 v1, s31 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NEXT: v_mov_b32_e32 v1, s37 +; 
GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NEXT: v_mov_b32_e32 v0, s38 +; GFX6-NEXT: v_mov_b32_e32 v1, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: v_mov_b32_e32 v3, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 -; GFX6-NEXT: v_mov_b32_e32 v8, s44 -; GFX6-NEXT: v_mov_b32_e32 v9, s45 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:256 +; GFX6-NEXT: v_mov_b32_e32 v7, s46 +; GFX6-NEXT: v_mov_b32_e32 v8, s47 +; GFX6-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX8-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x24 +; GFX8-NEXT: ; implicit-def: $vgpr53 : SGPR spill to VGPR lane +; GFX8-NEXT: ; implicit-def: $vgpr52 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 +; GFX8-NEXT: 
s_load_dwordx2 s[2:3], s[30:31], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s3, 8 -; GFX8-NEXT: s_lshr_b32 s48, s3, 15 -; GFX8-NEXT: v_writelane_b32 v62, s0, 0 -; GFX8-NEXT: s_lshr_b32 s74, s3, 30 -; GFX8-NEXT: s_lshr_b32 s30, s3, 31 -; GFX8-NEXT: s_lshr_b32 s72, s3, 28 -; GFX8-NEXT: s_lshr_b32 s34, s3, 29 -; GFX8-NEXT: s_lshr_b32 s70, s3, 26 -; GFX8-NEXT: s_lshr_b32 s36, s3, 27 -; GFX8-NEXT: s_lshr_b32 s68, s3, 24 -; GFX8-NEXT: s_lshr_b32 s38, s3, 25 -; GFX8-NEXT: s_lshr_b32 s64, s3, 22 -; GFX8-NEXT: s_lshr_b32 s40, s3, 23 -; GFX8-NEXT: s_lshr_b32 s60, s3, 20 -; GFX8-NEXT: s_lshr_b32 s42, s3, 21 -; GFX8-NEXT: s_lshr_b32 s66, s3, 18 -; GFX8-NEXT: s_lshr_b32 s44, s3, 19 -; GFX8-NEXT: s_lshr_b32 s56, s3, 16 -; GFX8-NEXT: s_lshr_b32 s46, s3, 17 -; GFX8-NEXT: s_lshr_b32 s58, s3, 14 -; GFX8-NEXT: s_lshr_b32 s62, s3, 12 -; GFX8-NEXT: s_lshr_b32 s54, s3, 10 -; GFX8-NEXT: v_writelane_b32 v62, s1, 1 -; GFX8-NEXT: s_lshr_b32 s0, s3, 9 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX8-NEXT: s_lshr_b32 s52, s3, 11 -; GFX8-NEXT: v_writelane_b32 v62, s0, 2 -; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX8-NEXT: s_lshr_b32 s0, s3, 28 +; GFX8-NEXT: v_writelane_b32 v53, s0, 0 +; GFX8-NEXT: v_writelane_b32 v53, s1, 1 +; GFX8-NEXT: s_lshr_b32 s0, s3, 29 +; GFX8-NEXT: v_writelane_b32 v53, s0, 2 +; GFX8-NEXT: v_writelane_b32 v53, s1, 3 +; GFX8-NEXT: s_lshr_b32 s0, s3, 26 +; GFX8-NEXT: v_writelane_b32 v53, s0, 4 +; GFX8-NEXT: 
v_writelane_b32 v53, s1, 5 +; GFX8-NEXT: s_lshr_b32 s0, s3, 27 +; GFX8-NEXT: v_writelane_b32 v53, s0, 6 +; GFX8-NEXT: v_writelane_b32 v53, s1, 7 +; GFX8-NEXT: s_lshr_b32 s0, s3, 24 +; GFX8-NEXT: v_writelane_b32 v53, s0, 8 +; GFX8-NEXT: v_writelane_b32 v53, s1, 9 +; GFX8-NEXT: s_lshr_b32 s0, s3, 25 +; GFX8-NEXT: v_writelane_b32 v53, s0, 10 +; GFX8-NEXT: v_writelane_b32 v53, s1, 11 +; GFX8-NEXT: s_lshr_b32 s0, s3, 22 +; GFX8-NEXT: v_writelane_b32 v53, s0, 12 +; GFX8-NEXT: v_writelane_b32 v53, s1, 13 +; GFX8-NEXT: s_lshr_b32 s0, s3, 23 +; GFX8-NEXT: v_writelane_b32 v53, s0, 14 +; GFX8-NEXT: v_writelane_b32 v53, s1, 15 +; GFX8-NEXT: s_lshr_b32 s0, s3, 20 +; GFX8-NEXT: v_writelane_b32 v53, s0, 16 +; GFX8-NEXT: v_writelane_b32 v53, s1, 17 +; GFX8-NEXT: s_lshr_b32 s0, s3, 21 +; GFX8-NEXT: v_writelane_b32 v53, s0, 18 +; GFX8-NEXT: v_writelane_b32 v53, s1, 19 +; GFX8-NEXT: s_lshr_b32 s0, s3, 18 +; GFX8-NEXT: v_writelane_b32 v53, s0, 20 +; GFX8-NEXT: v_writelane_b32 v53, s1, 21 +; GFX8-NEXT: s_lshr_b32 s0, s3, 19 +; GFX8-NEXT: v_writelane_b32 v53, s0, 22 +; GFX8-NEXT: v_writelane_b32 v53, s1, 23 +; GFX8-NEXT: s_lshr_b32 s0, s3, 16 +; GFX8-NEXT: v_writelane_b32 v53, s0, 24 +; GFX8-NEXT: v_writelane_b32 v53, s1, 25 +; GFX8-NEXT: s_lshr_b32 s0, s3, 17 +; GFX8-NEXT: v_writelane_b32 v53, s0, 26 +; GFX8-NEXT: v_writelane_b32 v53, s1, 27 +; GFX8-NEXT: s_lshr_b32 s0, s3, 14 +; GFX8-NEXT: v_writelane_b32 v53, s0, 28 +; GFX8-NEXT: v_writelane_b32 v53, s1, 29 +; GFX8-NEXT: s_lshr_b32 s0, s3, 15 +; GFX8-NEXT: v_writelane_b32 v53, s0, 30 +; GFX8-NEXT: v_writelane_b32 v53, s1, 31 +; GFX8-NEXT: s_lshr_b32 s0, s3, 12 +; GFX8-NEXT: v_writelane_b32 v53, s0, 32 +; GFX8-NEXT: v_writelane_b32 v53, s1, 33 +; GFX8-NEXT: s_lshr_b32 s0, s3, 13 +; GFX8-NEXT: v_writelane_b32 v53, s0, 34 +; GFX8-NEXT: s_lshr_b32 s46, s3, 30 +; GFX8-NEXT: s_lshr_b32 s44, s3, 31 +; GFX8-NEXT: v_writelane_b32 v53, s1, 35 +; GFX8-NEXT: s_lshr_b32 s0, s3, 10 +; GFX8-NEXT: v_writelane_b32 v53, s0, 36 ; GFX8-NEXT: s_bfe_i64 
s[46:47], s[46:47], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v34, s48 -; GFX8-NEXT: s_lshr_b32 s48, s2, 1 -; GFX8-NEXT: s_lshr_b32 s50, s3, 13 -; GFX8-NEXT: v_writelane_b32 v62, s1, 3 -; GFX8-NEXT: s_lshr_b32 s6, s3, 6 -; GFX8-NEXT: s_lshr_b32 s10, s3, 7 -; GFX8-NEXT: s_lshr_b32 s12, s3, 4 -; GFX8-NEXT: s_lshr_b32 s14, s3, 5 -; GFX8-NEXT: s_lshr_b32 s16, s3, 2 -; GFX8-NEXT: s_lshr_b32 s18, s3, 3 -; GFX8-NEXT: s_lshr_b32 s20, s3, 1 -; GFX8-NEXT: s_mov_b32 s22, s3 -; GFX8-NEXT: s_lshr_b32 s24, s2, 30 -; GFX8-NEXT: s_lshr_b32 s26, s2, 31 -; GFX8-NEXT: s_lshr_b32 s28, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v4, s74 -; GFX8-NEXT: v_mov_b32_e32 v12, s72 -; GFX8-NEXT: v_mov_b32_e32 v0, s70 -; GFX8-NEXT: v_mov_b32_e32 v8, s68 -; GFX8-NEXT: v_mov_b32_e32 v16, s64 -; GFX8-NEXT: v_mov_b32_e32 v20, s60 -; GFX8-NEXT: v_mov_b32_e32 v24, s66 -; GFX8-NEXT: v_mov_b32_e32 v28, s56 -; GFX8-NEXT: v_mov_b32_e32 v32, s58 -; GFX8-NEXT: v_mov_b32_e32 v36, s62 -; GFX8-NEXT: s_lshr_b32 s86, s2, 29 -; GFX8-NEXT: v_mov_b32_e32 v40, s54 -; GFX8-NEXT: s_lshr_b32 s84, s2, 26 -; GFX8-NEXT: s_lshr_b32 s82, s2, 27 -; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000 -; GFX8-NEXT: s_lshr_b32 s80, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v6, s30 -; GFX8-NEXT: v_mov_b32_e32 v7, s31 -; GFX8-NEXT: s_lshr_b32 s78, s2, 25 -; GFX8-NEXT: s_lshr_b32 s76, s2, 22 -; GFX8-NEXT: v_mov_b32_e32 v14, s34 -; GFX8-NEXT: s_lshr_b32 s74, s2, 23 -; GFX8-NEXT: s_lshr_b32 s72, s2, 20 -; GFX8-NEXT: v_mov_b32_e32 v2, s36 -; GFX8-NEXT: s_lshr_b32 s70, s2, 21 -; GFX8-NEXT: s_lshr_b32 s68, s2, 18 -; GFX8-NEXT: v_mov_b32_e32 v10, s38 -; GFX8-NEXT: s_lshr_b32 s66, s2, 19 -; GFX8-NEXT: 
s_lshr_b32 s64, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v18, s40 -; GFX8-NEXT: s_lshr_b32 s62, s2, 17 -; GFX8-NEXT: s_lshr_b32 s60, s2, 14 -; GFX8-NEXT: v_mov_b32_e32 v22, s42 -; GFX8-NEXT: s_lshr_b32 s58, s2, 15 -; GFX8-NEXT: s_lshr_b32 s56, s2, 12 -; GFX8-NEXT: v_mov_b32_e32 v26, s44 -; GFX8-NEXT: s_lshr_b32 s54, s2, 13 -; GFX8-NEXT: s_lshr_b32 s52, s2, 10 -; GFX8-NEXT: v_mov_b32_e32 v30, s46 -; GFX8-NEXT: s_lshr_b32 s4, s2, 11 -; GFX8-NEXT: s_lshr_b32 s0, s2, 8 -; GFX8-NEXT: s_lshr_b32 s46, s2, 9 -; GFX8-NEXT: s_lshr_b32 s44, s2, 6 -; GFX8-NEXT: s_lshr_b32 s42, s2, 7 -; GFX8-NEXT: s_lshr_b32 s40, s2, 4 -; GFX8-NEXT: s_lshr_b32 s38, s2, 5 -; GFX8-NEXT: s_lshr_b32 s36, s2, 2 -; GFX8-NEXT: s_lshr_b32 s34, s2, 3 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x10000 -; GFX8-NEXT: v_writelane_b32 v62, s2, 4 -; GFX8-NEXT: v_writelane_b32 v62, s3, 5 -; GFX8-NEXT: v_readlane_b32 s2, v62, 2 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX8-NEXT: v_readlane_b32 s3, v62, 3 -; GFX8-NEXT: v_mov_b32_e32 v38, s50 -; GFX8-NEXT: v_mov_b32_e32 v39, s51 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 -; GFX8-NEXT: v_readlane_b32 s2, v62, 0 -; GFX8-NEXT: v_readlane_b32 s3, v62, 1 -; GFX8-NEXT: v_mov_b32_e32 v5, s75 -; GFX8-NEXT: v_mov_b32_e32 v13, s73 -; GFX8-NEXT: v_mov_b32_e32 v15, s35 -; GFX8-NEXT: v_mov_b32_e32 v1, s71 -; GFX8-NEXT: v_mov_b32_e32 v3, s37 -; GFX8-NEXT: v_mov_b32_e32 v9, s69 -; GFX8-NEXT: v_mov_b32_e32 v11, s39 -; GFX8-NEXT: v_mov_b32_e32 v17, s65 -; GFX8-NEXT: v_mov_b32_e32 v19, s41 -; GFX8-NEXT: v_mov_b32_e32 v21, s61 -; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_mov_b32_e32 v25, s67 -; GFX8-NEXT: v_mov_b32_e32 v27, s45 -; GFX8-NEXT: v_mov_b32_e32 v29, s57 -; GFX8-NEXT: v_mov_b32_e32 v31, s47 -; GFX8-NEXT: v_mov_b32_e32 v33, s59 -; GFX8-NEXT: v_mov_b32_e32 v35, s49 -; GFX8-NEXT: v_mov_b32_e32 v37, s63 -; 
GFX8-NEXT: v_mov_b32_e32 v41, s55 -; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s1, 37 +; GFX8-NEXT: s_lshr_b32 vcc_lo, s3, 11 +; GFX8-NEXT: s_lshr_b32 s86, s3, 8 +; GFX8-NEXT: s_lshr_b32 s84, s3, 9 +; GFX8-NEXT: s_lshr_b32 s80, s3, 6 +; GFX8-NEXT: s_lshr_b32 s82, s3, 7 +; GFX8-NEXT: s_lshr_b32 s78, s3, 4 +; GFX8-NEXT: s_lshr_b32 s76, s3, 5 +; GFX8-NEXT: s_lshr_b32 s74, s3, 2 +; GFX8-NEXT: s_lshr_b32 s72, s3, 3 +; GFX8-NEXT: s_lshr_b32 s68, s3, 1 +; GFX8-NEXT: s_mov_b32 s70, s3 +; GFX8-NEXT: s_lshr_b32 s66, s2, 30 +; GFX8-NEXT: s_lshr_b32 s64, s2, 31 +; GFX8-NEXT: s_lshr_b32 s62, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v0, s46 +; GFX8-NEXT: s_lshr_b32 s60, s2, 29 +; GFX8-NEXT: v_mov_b32_e32 v2, s44 +; GFX8-NEXT: s_lshr_b32 s58, s2, 26 +; GFX8-NEXT: s_lshr_b32 s56, s2, 27 +; GFX8-NEXT: s_lshr_b32 s54, s2, 24 +; GFX8-NEXT: s_lshr_b32 s52, s2, 25 +; GFX8-NEXT: s_lshr_b32 s50, s2, 22 +; GFX8-NEXT: s_lshr_b32 s48, s2, 23 +; GFX8-NEXT: s_lshr_b32 s46, s2, 20 +; GFX8-NEXT: s_lshr_b32 s44, s2, 21 +; GFX8-NEXT: s_lshr_b32 s42, s2, 18 +; GFX8-NEXT: s_lshr_b32 s40, s2, 19 +; GFX8-NEXT: s_lshr_b32 s38, s2, 16 +; GFX8-NEXT: s_lshr_b32 s36, s2, 17 +; GFX8-NEXT: s_lshr_b32 s34, s2, 14 +; GFX8-NEXT: s_lshr_b32 s30, s2, 15 +; GFX8-NEXT: s_lshr_b32 s26, s2, 12 +; GFX8-NEXT: s_lshr_b32 s24, s2, 13 +; GFX8-NEXT: s_lshr_b32 s22, s2, 10 +; GFX8-NEXT: s_lshr_b32 s20, s2, 11 +; GFX8-NEXT: s_lshr_b32 s18, s2, 8 +; GFX8-NEXT: s_lshr_b32 s16, s2, 9 +; GFX8-NEXT: s_lshr_b32 s14, s2, 6 +; GFX8-NEXT: s_lshr_b32 s12, s2, 7 +; GFX8-NEXT: s_lshr_b32 s10, s2, 4 +; GFX8-NEXT: s_lshr_b32 s8, s2, 5 +; GFX8-NEXT: s_lshr_b32 s6, s2, 2 +; GFX8-NEXT: s_lshr_b32 s4, s2, 3 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX8-NEXT: 
v_writelane_b32 v53, s2, 38 +; GFX8-NEXT: v_writelane_b32 v53, s3, 39 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 40 +; GFX8-NEXT: v_writelane_b32 v53, s1, 41 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[4:5], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 42 +; GFX8-NEXT: v_writelane_b32 v53, s1, 43 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[6:7], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 44 +; GFX8-NEXT: v_writelane_b32 v53, s1, 45 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 46 +; GFX8-NEXT: v_writelane_b32 v53, s1, 47 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 48 +; GFX8-NEXT: v_writelane_b32 v53, s1, 49 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 50 +; GFX8-NEXT: v_writelane_b32 v53, s1, 51 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[14:15], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 52 +; GFX8-NEXT: v_writelane_b32 v53, s1, 53 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[16:17], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 54 +; GFX8-NEXT: v_writelane_b32 v53, s1, 55 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 56 +; GFX8-NEXT: v_writelane_b32 v53, s1, 57 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[20:21], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 58 +; GFX8-NEXT: v_writelane_b32 v53, s1, 59 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[22:23], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 60 +; GFX8-NEXT: v_writelane_b32 v53, s1, 61 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[24:25], 0x10000 +; GFX8-NEXT: v_writelane_b32 v53, s0, 62 +; GFX8-NEXT: v_writelane_b32 v53, s1, 63 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[26:27], 0x10000 +; GFX8-NEXT: v_writelane_b32 v52, s0, 0 +; GFX8-NEXT: v_writelane_b32 v52, s1, 1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[30:31], 0x10000 +; GFX8-NEXT: v_writelane_b32 v52, s0, 2 +; GFX8-NEXT: v_writelane_b32 v52, s1, 3 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[34:35], 0x10000 +; GFX8-NEXT: 
v_writelane_b32 v52, s0, 4 +; GFX8-NEXT: v_writelane_b32 v52, s1, 5 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[36:37], 0x10000 +; GFX8-NEXT: v_writelane_b32 v52, s0, 6 +; GFX8-NEXT: v_writelane_b32 v52, s1, 7 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[38:39], 0x10000 +; GFX8-NEXT: v_writelane_b32 v52, s0, 8 +; GFX8-NEXT: v_writelane_b32 v52, s1, 9 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[40:41], 0x10000 +; GFX8-NEXT: v_writelane_b32 v52, s0, 10 +; GFX8-NEXT: v_writelane_b32 v52, s1, 11 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[42:43], 0x10000 +; GFX8-NEXT: v_readlane_b32 s8, v53, 36 +; GFX8-NEXT: v_readlane_b32 s12, v53, 32 +; GFX8-NEXT: v_readlane_b32 s16, v53, 28 +; GFX8-NEXT: v_readlane_b32 s20, v53, 24 +; GFX8-NEXT: v_readlane_b32 s24, v53, 20 +; GFX8-NEXT: v_writelane_b32 v52, s0, 12 +; GFX8-NEXT: v_readlane_b32 s9, v53, 37 +; GFX8-NEXT: v_readlane_b32 s13, v53, 33 +; GFX8-NEXT: v_readlane_b32 s17, v53, 29 +; GFX8-NEXT: v_readlane_b32 s21, v53, 25 +; GFX8-NEXT: v_readlane_b32 s25, v53, 21 +; GFX8-NEXT: v_writelane_b32 v52, s1, 13 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[80:81], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[84:85], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[86:87], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[8:9], 0x10000 +; GFX8-NEXT: v_readlane_b32 s8, v53, 34 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[12:13], 0x10000 +; GFX8-NEXT: v_readlane_b32 s12, v53, 30 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[16:17], 0x10000 +; GFX8-NEXT: v_readlane_b32 s16, v53, 26 +; GFX8-NEXT: s_bfe_i64 s[22:23], s[20:21], 0x10000 +; GFX8-NEXT: v_readlane_b32 s20, v53, 22 +; GFX8-NEXT: s_bfe_i64 s[26:27], s[24:25], 0x10000 +; GFX8-NEXT: v_readlane_b32 s24, v53, 18 +; GFX8-NEXT: v_readlane_b32 s30, v53, 16 +; GFX8-NEXT: v_readlane_b32 s34, v53, 14 +; GFX8-NEXT: v_readlane_b32 s36, v53, 12 +; GFX8-NEXT: v_readlane_b32 s38, v53, 10 +; GFX8-NEXT: v_readlane_b32 s40, v53, 8 +; GFX8-NEXT: v_readlane_b32 s42, v53, 6 +; GFX8-NEXT: v_readlane_b32 s80, v53, 4 +; GFX8-NEXT: v_readlane_b32 s84, v53, 2 +; GFX8-NEXT: v_readlane_b32 
s86, v53, 0 +; GFX8-NEXT: v_readlane_b32 s9, v53, 35 +; GFX8-NEXT: v_readlane_b32 s13, v53, 31 +; GFX8-NEXT: v_readlane_b32 s17, v53, 27 +; GFX8-NEXT: v_readlane_b32 s21, v53, 23 +; GFX8-NEXT: v_readlane_b32 s25, v53, 19 +; GFX8-NEXT: v_readlane_b32 s31, v53, 17 +; GFX8-NEXT: v_readlane_b32 s35, v53, 15 +; GFX8-NEXT: v_readlane_b32 s37, v53, 13 +; GFX8-NEXT: v_readlane_b32 s39, v53, 11 +; GFX8-NEXT: v_readlane_b32 s41, v53, 9 +; GFX8-NEXT: v_readlane_b32 s43, v53, 7 +; GFX8-NEXT: v_readlane_b32 s81, v53, 5 +; GFX8-NEXT: v_readlane_b32 s85, v53, 3 +; GFX8-NEXT: v_readlane_b32 s87, v53, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 @@ -10322,272 +10384,358 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], vcc, 0x10000 +; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; 
GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x10000 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1f0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v43, s3 -; GFX8-NEXT: v_mov_b32_e32 v42, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1e0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v45, s3 -; GFX8-NEXT: v_mov_b32_e32 v44, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1d0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v47, s3 -; GFX8-NEXT: v_mov_b32_e32 v46, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1c0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v49, s3 -; GFX8-NEXT: v_mov_b32_e32 v48, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1b0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v51, s3 -; GFX8-NEXT: v_mov_b32_e32 v50, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1a0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v53, s3 -; GFX8-NEXT: v_mov_b32_e32 
v52, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x190 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v55, s3 -; GFX8-NEXT: v_mov_b32_e32 v54, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x180 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v57, s3 -; GFX8-NEXT: v_mov_b32_e32 v56, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x170 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v59, s3 -; GFX8-NEXT: v_mov_b32_e32 v58, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x160 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v61, s3 -; GFX8-NEXT: v_mov_b32_e32 v60, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x150 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x140 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x130 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x120 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s1 -; GFX8-NEXT: v_mov_b32_e32 v18, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x110 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo -; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: 
flat_store_dwordx4 v[54:55], v[24:27] -; GFX8-NEXT: v_mov_b32_e32 v10, s14 -; GFX8-NEXT: v_mov_b32_e32 v11, s15 -; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31] -; GFX8-NEXT: flat_store_dwordx4 v[58:59], v[32:35] -; GFX8-NEXT: flat_store_dwordx4 v[60:61], v[36:39] -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[40:43] -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GFX8-NEXT: s_add_u32 vcc_lo, s28, 0x1f0 +; GFX8-NEXT: s_addc_u32 vcc_hi, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v22, s84 +; GFX8-NEXT: s_add_u32 s84, s28, 0x1e0 +; GFX8-NEXT: v_mov_b32_e32 v23, s85 +; GFX8-NEXT: s_addc_u32 s85, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v32, s42 +; GFX8-NEXT: s_add_u32 s42, s28, 0x1d0 +; GFX8-NEXT: v_mov_b32_e32 v33, s43 +; GFX8-NEXT: s_addc_u32 s43, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v36, s38 +; GFX8-NEXT: s_add_u32 s38, s28, 0x1c0 +; GFX8-NEXT: v_mov_b32_e32 v37, s39 +; GFX8-NEXT: s_addc_u32 s39, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v26, s34 +; GFX8-NEXT: s_add_u32 s34, s28, 0x1b0 +; GFX8-NEXT: v_mov_b32_e32 v27, s35 +; GFX8-NEXT: s_addc_u32 s35, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v14, s24 +; GFX8-NEXT: s_add_u32 s24, s28, 0x1a0 +; GFX8-NEXT: v_mov_b32_e32 v15, s25 +; GFX8-NEXT: s_addc_u32 s25, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, s20 +; GFX8-NEXT: s_add_u32 s20, s28, 0x190 +; GFX8-NEXT: v_mov_b32_e32 v19, s21 +; GFX8-NEXT: s_addc_u32 s21, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s16 +; GFX8-NEXT: s_add_u32 s16, s28, 0x180 +; GFX8-NEXT: v_mov_b32_e32 v11, s17 +; GFX8-NEXT: s_addc_u32 s17, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v51, s17 +; GFX8-NEXT: v_mov_b32_e32 v38, s84 +; GFX8-NEXT: v_mov_b32_e32 v50, s16 +; GFX8-NEXT: s_add_u32 s16, s28, 0x170 +; GFX8-NEXT: v_mov_b32_e32 v20, s86 +; GFX8-NEXT: v_mov_b32_e32 v21, s87 +; GFX8-NEXT: v_mov_b32_e32 v39, s85 +; GFX8-NEXT: s_addc_u32 s17, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v45, s35 +; GFX8-NEXT: flat_store_dwordx4 v[38:39], v[20:23] 
+; GFX8-NEXT: v_mov_b32_e32 v24, s36 +; GFX8-NEXT: v_mov_b32_e32 v22, s8 +; GFX8-NEXT: s_add_u32 s8, s28, 0x160 +; GFX8-NEXT: v_mov_b32_e32 v25, s37 +; GFX8-NEXT: v_mov_b32_e32 v44, s34 +; GFX8-NEXT: v_mov_b32_e32 v23, s9 +; GFX8-NEXT: s_addc_u32 s9, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v40, s42 +; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[24:27] +; GFX8-NEXT: v_mov_b32_e32 v30, s80 +; GFX8-NEXT: v_mov_b32_e32 v24, s2 +; GFX8-NEXT: s_add_u32 s2, s28, 0x150 +; GFX8-NEXT: v_mov_b32_e32 v31, s81 +; GFX8-NEXT: v_mov_b32_e32 v41, s43 +; GFX8-NEXT: v_mov_b32_e32 v25, s3 +; GFX8-NEXT: s_addc_u32 s3, s29, 0 +; GFX8-NEXT: flat_store_dwordx4 v[40:41], v[30:33] +; GFX8-NEXT: v_mov_b32_e32 v47, s25 +; GFX8-NEXT: v_mov_b32_e32 v33, s3 +; GFX8-NEXT: v_mov_b32_e32 v32, s2 +; GFX8-NEXT: s_add_u32 s2, s28, 0x140 +; GFX8-NEXT: v_mov_b32_e32 v5, vcc_lo +; GFX8-NEXT: v_mov_b32_e32 v12, s30 +; GFX8-NEXT: v_mov_b32_e32 v13, s31 +; GFX8-NEXT: v_mov_b32_e32 v46, s24 +; GFX8-NEXT: s_addc_u32 s3, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, vcc_hi +; GFX8-NEXT: v_mov_b32_e32 v29, s17 +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[12:15] +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x130 +; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: v_mov_b32_e32 v28, s16 +; GFX8-NEXT: v_mov_b32_e32 v6, s12 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 +; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v43, s39 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x100 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x120 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NEXT: 
v_mov_b32_e32 v34, s40 +; GFX8-NEXT: v_mov_b32_e32 v35, s41 +; GFX8-NEXT: v_mov_b32_e32 v42, s38 +; GFX8-NEXT: v_mov_b32_e32 v49, s21 +; GFX8-NEXT: v_mov_b32_e32 v31, s9 +; GFX8-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v16, s26 +; GFX8-NEXT: v_mov_b32_e32 v17, s27 +; GFX8-NEXT: v_mov_b32_e32 v48, s20 +; GFX8-NEXT: v_mov_b32_e32 v8, s22 +; GFX8-NEXT: v_mov_b32_e32 v9, s23 +; GFX8-NEXT: v_mov_b32_e32 v20, s14 +; GFX8-NEXT: v_mov_b32_e32 v21, s15 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[34:37] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v30, s8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v35, s3 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 +; GFX8-NEXT: v_mov_b32_e32 v34, s2 +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v26, s6 +; GFX8-NEXT: v_mov_b32_e32 v27, s7 +; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v14, s82 +; GFX8-NEXT: v_mov_b32_e32 v8, s78 +; GFX8-NEXT: v_mov_b32_e32 v15, s83 +; GFX8-NEXT: v_mov_b32_e32 v9, s79 +; GFX8-NEXT: flat_store_dwordx4 v[30:31], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v10, s76 +; GFX8-NEXT: v_mov_b32_e32 v11, s77 +; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[24:27] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v0, s74 +; GFX8-NEXT: v_mov_b32_e32 v1, s75 +; GFX8-NEXT: v_mov_b32_e32 v2, s72 +; GFX8-NEXT: v_mov_b32_e32 v3, s73 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xf0 -; GFX8-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NEXT: v_mov_b32_e32 v1, s23 
-; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s28, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v0, s70 +; GFX8-NEXT: v_mov_b32_e32 v1, s71 +; GFX8-NEXT: v_mov_b32_e32 v2, s68 +; GFX8-NEXT: v_mov_b32_e32 v3, s69 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xe0 -; GFX8-NEXT: v_mov_b32_e32 v0, s24 -; GFX8-NEXT: v_mov_b32_e32 v1, s25 -; GFX8-NEXT: v_mov_b32_e32 v2, s26 -; GFX8-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s28, 0xe0 +; GFX8-NEXT: v_mov_b32_e32 v0, s66 +; GFX8-NEXT: v_mov_b32_e32 v1, s67 +; GFX8-NEXT: v_mov_b32_e32 v2, s64 +; GFX8-NEXT: v_mov_b32_e32 v3, s65 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v0, s28 -; GFX8-NEXT: v_mov_b32_e32 v1, s29 -; GFX8-NEXT: v_mov_b32_e32 v2, s86 -; GFX8-NEXT: v_mov_b32_e32 v3, s87 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s28, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v0, s62 +; GFX8-NEXT: v_mov_b32_e32 v1, s63 +; GFX8-NEXT: v_mov_b32_e32 v2, s60 +; GFX8-NEXT: v_mov_b32_e32 v3, s61 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xc0 -; GFX8-NEXT: v_mov_b32_e32 v0, s84 -; GFX8-NEXT: v_mov_b32_e32 v1, s85 -; GFX8-NEXT: v_mov_b32_e32 v2, s82 -; GFX8-NEXT: v_mov_b32_e32 v3, s83 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s28, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v0, s58 +; GFX8-NEXT: v_mov_b32_e32 v1, s59 +; GFX8-NEXT: v_mov_b32_e32 v2, s56 +; GFX8-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xb0 -; GFX8-NEXT: v_mov_b32_e32 v0, s80 -; GFX8-NEXT: v_mov_b32_e32 v1, s81 -; GFX8-NEXT: v_mov_b32_e32 v2, s78 -; GFX8-NEXT: v_mov_b32_e32 v3, s79 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s28, 0xb0 +; GFX8-NEXT: v_mov_b32_e32 v0, s54 +; GFX8-NEXT: v_mov_b32_e32 v1, s55 +; GFX8-NEXT: v_mov_b32_e32 v2, s52 +; GFX8-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xa0 -; GFX8-NEXT: v_mov_b32_e32 v0, s76 -; GFX8-NEXT: v_mov_b32_e32 v1, s77 -; GFX8-NEXT: v_mov_b32_e32 v2, s74 -; GFX8-NEXT: v_mov_b32_e32 v3, s75 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s28, 0xa0 +; GFX8-NEXT: v_mov_b32_e32 v0, s50 +; GFX8-NEXT: v_mov_b32_e32 v1, s51 +; GFX8-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x90 -; GFX8-NEXT: v_mov_b32_e32 v0, s72 -; GFX8-NEXT: v_mov_b32_e32 v1, s73 -; GFX8-NEXT: v_mov_b32_e32 v2, s70 -; GFX8-NEXT: v_mov_b32_e32 v3, s71 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v52, 12 +; GFX8-NEXT: v_mov_b32_e32 v0, s46 +; GFX8-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NEXT: v_mov_b32_e32 v2, s44 +; GFX8-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NEXT: v_readlane_b32 s1, v52, 13 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v52, 10 +; GFX8-NEXT: v_readlane_b32 s1, v52, 11 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x90 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, 
s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x80 -; GFX8-NEXT: v_mov_b32_e32 v0, s68 -; GFX8-NEXT: v_mov_b32_e32 v1, s69 -; GFX8-NEXT: v_mov_b32_e32 v2, s66 -; GFX8-NEXT: v_mov_b32_e32 v3, s67 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v52, 8 +; GFX8-NEXT: v_readlane_b32 s1, v52, 9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v52, 6 +; GFX8-NEXT: v_readlane_b32 s1, v52, 7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x80 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x70 -; GFX8-NEXT: v_mov_b32_e32 v0, s64 -; GFX8-NEXT: v_mov_b32_e32 v1, s65 -; GFX8-NEXT: v_mov_b32_e32 v2, s62 -; GFX8-NEXT: v_mov_b32_e32 v3, s63 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v52, 4 +; GFX8-NEXT: v_readlane_b32 s1, v52, 5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v52, 2 +; GFX8-NEXT: v_readlane_b32 s1, v52, 3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x70 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v0, s60 -; GFX8-NEXT: v_mov_b32_e32 v1, s61 -; GFX8-NEXT: v_mov_b32_e32 v2, s58 -; GFX8-NEXT: v_mov_b32_e32 v3, s59 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v52, 0 +; GFX8-NEXT: v_readlane_b32 s1, v52, 1 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, 
v53, 62 +; GFX8-NEXT: v_readlane_b32 s1, v53, 63 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v0, s56 -; GFX8-NEXT: v_mov_b32_e32 v1, s57 -; GFX8-NEXT: v_mov_b32_e32 v2, s54 -; GFX8-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v53, 60 +; GFX8-NEXT: v_readlane_b32 s1, v53, 61 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v53, 58 +; GFX8-NEXT: v_readlane_b32 s1, v53, 59 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 64 -; GFX8-NEXT: v_mov_b32_e32 v0, s52 -; GFX8-NEXT: v_mov_b32_e32 v1, s53 -; GFX8-NEXT: v_mov_b32_e32 v2, s50 -; GFX8-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v53, 56 +; GFX8-NEXT: v_readlane_b32 s1, v53, 57 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v53, 54 +; GFX8-NEXT: v_readlane_b32 s1, v53, 55 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 64 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 48 -; GFX8-NEXT: v_mov_b32_e32 v0, s48 -; GFX8-NEXT: v_mov_b32_e32 v1, s49 -; GFX8-NEXT: v_mov_b32_e32 v2, s46 -; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v53, 52 +; GFX8-NEXT: 
v_readlane_b32 s1, v53, 53 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v53, 50 +; GFX8-NEXT: v_readlane_b32 s1, v53, 51 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 48 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NEXT: v_mov_b32_e32 v0, s44 -; GFX8-NEXT: v_mov_b32_e32 v1, s45 -; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v53, 48 +; GFX8-NEXT: v_readlane_b32 s1, v53, 49 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v53, 46 +; GFX8-NEXT: v_readlane_b32 s1, v53, 47 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 32 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s40 -; GFX8-NEXT: v_mov_b32_e32 v1, s41 -; GFX8-NEXT: v_mov_b32_e32 v2, s38 -; GFX8-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_readlane_b32 s0, v53, 44 +; GFX8-NEXT: v_readlane_b32 s1, v53, 45 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v53, 42 +; GFX8-NEXT: v_readlane_b32 s1, v53, 43 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s28, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s29, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s36 -; GFX8-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: 
v_mov_b32_e32 v3, s35 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_readlane_b32 s0, v62, 4 +; GFX8-NEXT: v_readlane_b32 s0, v53, 38 +; GFX8-NEXT: v_readlane_b32 s1, v53, 39 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_readlane_b32 s1, v62, 5 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s30 -; GFX8-NEXT: v_mov_b32_e32 v1, s31 +; GFX8-NEXT: v_mov_b32_e32 v4, s28 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v53, 40 +; GFX8-NEXT: v_readlane_b32 s1, v53, 41 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s29 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index a135b43bad0fe..f440c144555c8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -663,82 +663,82 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 26 -; GCN-NOHSA-VI-NEXT: flat_load_ushort v16, v[0:1] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v17, v[2:3] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v18, v[4:5] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v19, v[6:7] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v8, v[8:9] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v9, v[10:11] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[12:13] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v11, v[14:15] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 24 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 22 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 20 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 18 -; GCN-NOHSA-VI-NEXT: flat_load_ushort v12, v[0:1] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v13, v[2:3] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v14, v[4:5] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v15, v[6:7] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v22, v[0:1] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 18 +; GCN-NOHSA-VI-NEXT: flat_load_ushort v23, v[2:3] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v24, v[4:5] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 +; GCN-NOHSA-VI-NEXT: flat_load_ushort v25, v[6:7] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v26, v[8:9] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[10:11] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v11, v[12:13] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NOHSA-VI-NEXT: flat_load_ushort v12, v[14:15] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v13, v[16:17] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2 +; GCN-NOHSA-VI-NEXT: 
s_addc_u32 s3, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NOHSA-VI-NEXT: flat_load_ushort v14, v[18:19] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v15, v[20:21] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v20, v[2:3] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v21, v[4:5] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v22, v[6:7] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v16, v[2:3] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v4, v[4:5] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v17, v[6:7] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v8, v[8:9] ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v17, v1 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v23, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(13) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v19, v1 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v25, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(11) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(10) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v9, v1 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v10, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(9) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v11, v4 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v12, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 
v6, v13, v4 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v14, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v15, v4 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v0, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v20, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v4, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v22, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v8, v0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -2741,24 +2741,24 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -3211,28 +3211,28 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 ; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[4:7] ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; 
GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 @@ -3669,29 +3669,28 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 @@ -3774,9 +3773,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s44, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s45, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s41, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s44, s15, 16 ; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff @@ -3786,24 +3785,24 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s40, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s41, s9, 0xffff -; GCN-HSA-NEXT: s_and_b32 s43, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s45, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s49, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s51, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s53, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s50, s15, 0xffff -; GCN-HSA-NEXT: s_and_b32 s52, s14, 0xffff +; GCN-HSA-NEXT: s_and_b32 s58, s14, 0xffff ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s18, s1, 16 ; GCN-HSA-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-HSA-NEXT: s_lshr_b32 s53, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s54, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s55, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s56, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s57, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s58, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s51, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s52, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s54, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s55, s4, 16 
+; GCN-HSA-NEXT: s_lshr_b32 s56, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s57, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s59, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s61, s11, 16 @@ -3812,117 +3811,117 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s64, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s65, s15, 16 ; GCN-HSA-NEXT: s_lshr_b32 s66, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s67, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s68, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s67, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s68, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s69, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s15, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v24, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s63 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v40, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v19, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[26:29] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[22:25] +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s56 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s69 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s68 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v14, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v41, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s44 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[30:33] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[34:37] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[38:41] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[6:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 @@ -3932,9 +3931,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4004,6 +4003,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s7, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s6, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s9, 16 +; GCN-NOHSA-VI-NEXT: 
s_lshr_b32 s60, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff @@ -4020,7 +4020,6 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s11, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s13, 16 @@ -4588,29 +4587,28 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s57 @@ -4706,142 +4704,142 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_sext_i32_i16 s43, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s44, s10 ; GCN-HSA-NEXT: s_ashr_i32 s45, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s47, s12, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s46, s13 -; GCN-HSA-NEXT: s_sext_i32_i16 s49, s12 +; GCN-HSA-NEXT: s_ashr_i32 s46, s12, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s47, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s51, s12 ; GCN-HSA-NEXT: s_ashr_i32 s48, s15, 16 ; GCN-HSA-NEXT: s_ashr_i32 s50, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s51, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s52, s14 +; GCN-HSA-NEXT: s_sext_i32_i16 s49, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s55, s14 ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 ; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 -; GCN-HSA-NEXT: s_ashr_i32 s53, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s54, s2, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s58, s4, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s60, s6, 16 -; GCN-HSA-NEXT: s_ashr_i32 s61, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s62, s8, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s64, s10, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s57, s0 +; GCN-HSA-NEXT: s_ashr_i32 s0, s3, 16 +; 
GCN-HSA-NEXT: s_ashr_i32 s52, s2, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s58, s2 +; GCN-HSA-NEXT: s_ashr_i32 s2, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s53, s4, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s59, s4 +; GCN-HSA-NEXT: s_ashr_i32 s4, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s54, s6, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s60, s6 +; GCN-HSA-NEXT: s_ashr_i32 s6, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s56, s8, 16 +; GCN-HSA-NEXT: s_ashr_i32 s61, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s62, s10, 16 ; GCN-HSA-NEXT: s_ashr_i32 s65, s13, 16 ; GCN-HSA-NEXT: s_ashr_i32 s66, s12, 16 ; GCN-HSA-NEXT: s_ashr_i32 s67, s15, 16 ; GCN-HSA-NEXT: s_ashr_i32 s68, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s56, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 -; GCN-HSA-NEXT: s_sext_i32_i16 s55, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 +; GCN-HSA-NEXT: s_sext_i32_i16 s64, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xf0 +; GCN-HSA-NEXT: s_sext_i32_i16 s63, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; 
GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x80 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 -; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x70 ; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 ; GCN-HSA-NEXT: 
s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s9 +; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v41, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[26:29] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v19, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[22:25] +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s4 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s57 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v12, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v40, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[30:33] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[34:37] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[38:41] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[6:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 @@ -7253,10 +7251,10 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 16 ; GCN-HSA-NEXT: s_mov_b32 s14, s5 ; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 16 +; GCN-HSA-NEXT: s_mov_b32 s18, s3 ; GCN-HSA-NEXT: s_ashr_i32 s25, s1, 31 ; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 31 ; GCN-HSA-NEXT: s_ashr_i32 s30, s3, 16 -; GCN-HSA-NEXT: s_mov_b32 s18, s3 ; GCN-HSA-NEXT: s_lshr_b32 s20, s2, 16 ; GCN-HSA-NEXT: s_mov_b32 s22, s1 ; GCN-HSA-NEXT: s_lshr_b32 s24, s0, 16 @@ -8311,152 +8309,151 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 -; GCN-NOHSA-SI-NEXT: s_mov_b32 
s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s3, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s9, 31 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s11, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s13, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s15, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s2, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[14:15], 0x100000 -; 
GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s53 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s11 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v10, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s9 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s13, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s15, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s13, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s36 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[16:19], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s20 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[16:19], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s7 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s9, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], 
off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[16:19], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s13 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[16:19], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s3 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s5, 31 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s36 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[16:19], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s13 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s40 
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s9 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[16:19], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s1 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s1, 31 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s3, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s3, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s1, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, 
s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s36 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[16:19], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[16:19], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[0:1], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[2:3], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s36 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s13 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[16:19], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[16:19], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[16:19], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s5 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[16:19], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s3 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -8468,159 +8465,159 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s24, s15 -; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s13, 31 -; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 31 -; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 -; GCN-HSA-NEXT: s_mov_b32 s48, s13 -; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16 -; GCN-HSA-NEXT: s_mov_b32 s52, s11 -; GCN-HSA-NEXT: s_lshr_b32 s34, s10, 16 -; GCN-HSA-NEXT: s_mov_b32 s30, s9 -; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-HSA-NEXT: s_mov_b32 s54, s7 -; GCN-HSA-NEXT: s_lshr_b32 s56, s6, 16 -; GCN-HSA-NEXT: s_mov_b32 s58, s5 -; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16 -; GCN-HSA-NEXT: s_mov_b32 s62, s3 -; GCN-HSA-NEXT: s_lshr_b32 s64, s2, 16 -; GCN-HSA-NEXT: s_mov_b32 s66, s1 -; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 +; GCN-HSA-NEXT: s_mov_b32 s28, s15 +; GCN-HSA-NEXT: s_ashr_i32 s45, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s46, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s63, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s67, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s14, 16 +; GCN-HSA-NEXT: s_mov_b32 s58, s13 +; GCN-HSA-NEXT: s_lshr_b32 s42, s12, 16 +; GCN-HSA-NEXT: s_mov_b32 s38, 
s11 +; GCN-HSA-NEXT: s_lshr_b32 s36, s10, 16 +; GCN-HSA-NEXT: s_mov_b32 s34, s9 +; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 16 +; GCN-HSA-NEXT: s_mov_b32 s60, s7 +; GCN-HSA-NEXT: s_lshr_b32 s62, s6, 16 +; GCN-HSA-NEXT: s_mov_b32 s64, s5 +; GCN-HSA-NEXT: s_lshr_b32 s66, s4, 16 +; GCN-HSA-NEXT: s_mov_b32 s68, s3 +; GCN-HSA-NEXT: s_lshr_b32 s70, s2, 16 +; GCN-HSA-NEXT: s_mov_b32 s72, s1 +; GCN-HSA-NEXT: s_lshr_b32 s74, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[28:29], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GCN-HSA-NEXT: s_ashr_i32 s36, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s39, s5, 31 -; GCN-HSA-NEXT: s_ashr_i32 s40, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s41, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s42, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s44, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s45, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s44, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s47, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s48, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s49, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s50, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s51, s9, 31 +; GCN-HSA-NEXT: s_ashr_i32 s52, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s53, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s54, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s55, s13, 31 ; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[76:77], s[14:15], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[54:55], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[74:75], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[72:73], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[70:71], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[60:61], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GCN-HSA-NEXT: 
s_add_u32 s46, s16, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s74 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s75 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 -; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 -; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59 -; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s73 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34 
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s60, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s61, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s56 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s57 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s57 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s57 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s57 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s57 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s60 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 +; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s59 +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s55 +; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s76 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s77 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 +; GCN-HSA-NEXT: s_add_u32 s22, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: s_addc_u32 s23, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, s41 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[18:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s37 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v31, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s22 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[22:25] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[26:29] +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[30:33] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s23 +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8639,8 +8636,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8659,7 +8656,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 6f7ee70812264..7f3ced41e167e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -3064,68 +3064,66 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s18, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s1, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s0, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s2, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s4, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s26, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s6, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s28, s9, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s8, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s11, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s10, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s13, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s15, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s14, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s36, s12, 31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s13 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 +; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s18, -1 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s13, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s15, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s14, s14, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s12, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s2 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s20 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:96 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s30 +; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s2, s2, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s14, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s9, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s8, s8, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s11, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s10, s10, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s6, s6, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s20 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:80 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s15 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s20 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s14 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[16:19], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[16:19], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: @@ -3155,28 +3153,35 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_ashr_i32 s36, s14, 31 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 
; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 64 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s30 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 @@ -3184,7 +3189,7 @@ define amdgpu_kernel void 
@constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 @@ -3194,7 +3199,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 @@ -3203,25 +3208,18 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s14 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[9:12] +; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[18:19], v[6:9] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -3967,136 +3965,136 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 +; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 +; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s38, -1 -; GFX6-NOHSA-NEXT: s_mov_b32 s36, s0 -; GFX6-NOHSA-NEXT: s_mov_b32 s37, s1 -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s17, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s16, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s19, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s18, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s21, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s20, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s30, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s31, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s28, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s44 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s29, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s23, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s44 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s22, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v2, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22 +; GFX6-NOHSA-NEXT: s_mov_b32 s36, s16 +; GFX6-NOHSA-NEXT: s_mov_b32 s37, s17 +; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[18:19], 0x0 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s13, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s15, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s14, s14, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s12, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s2 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s33 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s16 -; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s25, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s27, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s26, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s24, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s18 +; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s9, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s11, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s10, s10, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s8, s8, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s11 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v9, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s1, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s0, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s2, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s4, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s6, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s26, s9, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s8, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s10, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s13, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s12, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s22 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s15, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s16 -; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s14, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s12 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NOHSA-NEXT: s_ashr_i32 s8, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s9, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s6, s6, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s9 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s26 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176 -; GFX6-NOHSA-NEXT: s_waitcnt 
expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s8 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:160 +; GFX6-NOHSA-NEXT: s_ashr_i32 s4, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s5, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s2, s2, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s5 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s33 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:128 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s22 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[36:39], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s30 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s20 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128 +; GFX6-NOHSA-NEXT: s_ashr_i32 s0, s29, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s1, s31, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s2, s30, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s3, s28, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s1 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s0 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s26 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s24 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NOHSA-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[36:39], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:96 +; GFX6-NOHSA-NEXT: s_ashr_i32 s0, s17, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s1, s16, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s2, s19, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s3, s18, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s4, s21, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s5, s20, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s6, s23, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s7, s25, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s8, s24, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s9, s27, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s10, s26, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s22, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:80 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s6 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s4 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[36:39], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: @@ -4108,53 +4106,53 
@@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_ashr_i32 s20, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s21, s0, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s22, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s23, s2, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s24, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s25, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s28, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s6, 31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_ashr_i32 s36, s9, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s37, s8, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s38, s11, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s39, s10, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s40, s13, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s12, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s42, s15, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s43, s14, 31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s0 +; GFX7-HSA-NEXT: s_ashr_i32 s24, s1, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s25, s0, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s30, 
s2, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s31, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s4, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s38, s9, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s39, s8, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s40, s11, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s41, s10, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s42, s13, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s12, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s44, s15, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s45, s14, 31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s41 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_ashr_i32 s18, s1, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s19, s0, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s26, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s2, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s30, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s31, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s44, s6, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s45, s9, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s20, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s21, s2, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s22, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s23, s4, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s26, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s27, s6, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s28, s9, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s46, s8, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s47, s11, 31 ; GFX7-HSA-NEXT: 
s_ashr_i32 s48, s10, 31 @@ -4164,127 +4162,125 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_ashr_i32 s52, s14, 31 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xe0 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xd0 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xc0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30] ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xb0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xa0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26] ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x90 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[21:24] ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29 -; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 -; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s25 -; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 -; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 -; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[20:23] -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 
v7, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s47 -; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[3:6] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[6:9] -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x70 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[18:21] +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s36 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s38 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[7:10] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s30 +; GFX7-HSA-NEXT: s_add_u32 s30, s16, 64 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s29 +; GFX7-HSA-NEXT: s_addc_u32 s31, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s30 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v16, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[11:14] +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[24:27] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[7:10] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -4649,128 +4645,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; EG-NEXT: MOV * T32.Z, T12.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-HSA-LABEL: constant_sextload_v32i32_to_v32i64: -; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 -; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 -; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-HSA-NEXT: s_ashr_i32 s58, s30, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s31, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s28, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s29, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s26, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v8, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s27, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s24, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v11, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s25, 31 -; GFX9-HSA-NEXT: 
v_mov_b32_e32 v1, s30 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX9-HSA-NEXT: s_ashr_i32 s57, s23, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v13, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s22, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240 -; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s29 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s28 -; GFX9-HSA-NEXT: s_ashr_i32 s55, s21, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s56, s20, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[4:7], s[36:37] offset:224 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s57 -; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s54, s18, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:176 -; GFX9-HSA-NEXT: s_ashr_i32 s51, s17, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s20 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s56 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s55 -; GFX9-HSA-NEXT: s_ashr_i32 s52, s16, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:160 -; GFX9-HSA-NEXT: s_ashr_i32 s49, s15, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s54 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s53 -; GFX9-HSA-NEXT: s_ashr_i32 s50, s14, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:144 -; GFX9-HSA-NEXT: s_ashr_i32 s47, s13, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s52 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s51 -; GFX9-HSA-NEXT: s_ashr_i32 s48, s12, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:128 -; GFX9-HSA-NEXT: s_ashr_i32 s45, s11, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s50 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s49 -; GFX9-HSA-NEXT: s_ashr_i32 s46, s10, 31 -; 
GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:112 -; GFX9-HSA-NEXT: s_ashr_i32 s43, s9, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s47 -; GFX9-HSA-NEXT: s_ashr_i32 s44, s8, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:96 -; GFX9-HSA-NEXT: s_ashr_i32 s41, s7, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s45 -; GFX9-HSA-NEXT: s_ashr_i32 s42, s6, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:80 -; GFX9-HSA-NEXT: s_ashr_i32 s39, s5, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s43 -; GFX9-HSA-NEXT: s_ashr_i32 s40, s4, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:64 -; GFX9-HSA-NEXT: s_ashr_i32 s35, s3, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s41 -; GFX9-HSA-NEXT: s_ashr_i32 s38, s2, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:48 -; GFX9-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s39 -; GFX9-HSA-NEXT: s_ashr_i32 s34, s0, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s26 -; GFX9-HSA-NEXT: v_mov_b32_e32 v9, s27 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:32 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[7:10], s[36:37] offset:208 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-HSA-NEXT: 
v_mov_b32_e32 v12, s25 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:16 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[10:13], s[36:37] offset:192 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s33 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] -; GFX9-HSA-NEXT: s_endpgm -; ; GFX12-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index b534c2c267fad..4d2c3c2bbebac 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -2509,24 +2509,24 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s24 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; 
GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -2986,26 +2986,26 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i32 s36, s11, 0x80008 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s31 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[4:7] ; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -3452,29 +3452,28 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s49 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v12, s67 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s65 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s46 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s45 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s66 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s65 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s10 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(5) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 @@ -3558,19 +3557,19 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s31, s5, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s33, s5, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s35, s6, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s37, s6, 0x80008 -; 
GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s36, s6, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s37, s7, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s39, s7, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s41, s8, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s43, s8, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s9, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s46, s9, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s47, s10, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s48, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s49, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s50, s11, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s51, s12, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s52, s12, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s8, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s41, s8, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s42, s9, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s43, s9, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s10, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s46, s10, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s47, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s48, s11, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s12, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s51, s12, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s53, s13, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s54, s13, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s55, s14, 24 @@ -3585,124 +3584,124 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s34, s3, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s36, s4, 0xff +; GFX7-HSA-NEXT: s_and_b32 s38, s4, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s40, s5, 0xff +; GFX7-HSA-NEXT: s_and_b32 s45, s5, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s42, s6, 0xff +; GFX7-HSA-NEXT: s_and_b32 s49, s6, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s45, s7, 0xff +; GFX7-HSA-NEXT: s_and_b32 s52, s7, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s59, s8, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s60, s8, 0x80010 -; 
GFX7-HSA-NEXT: s_and_b32 s61, s9, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s62, s9, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s63, s10, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s60, s9, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s61, s10, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s64, s11, 0xff +; GFX7-HSA-NEXT: s_and_b32 s62, s11, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s65, s12, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s12, s12, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s66, s13, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s63, s12, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s64, s12, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s65, s13, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s66, s13, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s67, s14, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s68, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v11, s53 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; 
GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x80 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[18:21] +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s57 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[14:17] +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s66 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v40, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s55 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s62 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[22:25] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[26:29] +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v38, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v37, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v39, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v41, s42 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v15, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[30:33] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[34:37] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[38:41] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s35 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[6:9] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s45 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 @@ -3713,7 +3712,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 @@ -4371,29 +4370,28 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s66 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s65 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s64 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s63 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s61 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v16, s58 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s57 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s56 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s55 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s54 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s53 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(5) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s51 @@ -4485,138 +4483,138 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s38, s6, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s39, s6, 0x80008 -; GFX7-HSA-NEXT: s_sext_i32_i8 s40, s6 +; GFX7-HSA-NEXT: s_sext_i32_i8 s44, s6 ; GFX7-HSA-NEXT: s_ashr_i32 s6, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s43, s8, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s44, s8, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s45, s8, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s47, s9, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s48, s9, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80008 -; 
GFX7-HSA-NEXT: s_ashr_i32 s51, s10, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s52, s10, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s53, s10, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s54, s11, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s55, s11, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s56, s11, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s57, s12, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s58, s12, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s59, s12, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s60, s13, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s61, s13, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s62, s13, 0x80008 +; GFX7-HSA-NEXT: s_bfe_i32 s40, s7, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80008 +; GFX7-HSA-NEXT: s_sext_i32_i8 s51, s7 +; GFX7-HSA-NEXT: s_ashr_i32 s7, s8, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s42, s8, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s43, s8, 0x80008 +; GFX7-HSA-NEXT: s_sext_i32_i8 s54, s8 +; GFX7-HSA-NEXT: s_ashr_i32 s8, s9, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s45, s9, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s46, s9, 0x80008 +; GFX7-HSA-NEXT: s_sext_i32_i8 s55, s9 +; GFX7-HSA-NEXT: s_ashr_i32 s9, s10, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s47, s10, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s48, s10, 0x80008 +; GFX7-HSA-NEXT: s_sext_i32_i8 s56, s10 +; GFX7-HSA-NEXT: s_ashr_i32 s10, s11, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s49, s11, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s50, s11, 0x80008 +; GFX7-HSA-NEXT: s_sext_i32_i8 s57, s11 +; GFX7-HSA-NEXT: s_ashr_i32 s11, s12, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s52, s12, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s53, s12, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s59, s13, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s60, s13, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s61, s13, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s63, s14, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s64, s14, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s65, s14, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s66, s15, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s67, s15, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s68, s15, 0x80008 -; GFX7-HSA-NEXT: s_sext_i32_i8 s46, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 -; GFX7-HSA-NEXT: s_sext_i32_i8 
s50, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 -; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 +; GFX7-HSA-NEXT: s_sext_i32_i8 s58, s12 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xf0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s62, s13 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x90 +; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x80 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[18:21] +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v41, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x60 ; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v37, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v0, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s55 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s66 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s57 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[22:25] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[26:29] +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s49 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v33, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 ; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v38, s55 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[14:17] +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v39, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v40, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[30:33] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[34:37] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[38:41] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s37 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: s_sext_i32_i8 
s4, s4 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[6:9] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 @@ -7027,87 +7025,85 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s8, s5 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s2, 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, s3 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[0:1], 0x80000 +; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s7, s1, 24 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s3, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s14 +; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s0, 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s12, s1 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[8:9], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s18 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:64 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s0, s0, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s3, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s1, 8 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s1, 31 +; 
GFX6-NOHSA-NEXT: s_ashr_i32 s18, s3, 31 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s35 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[8:9], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s11 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 
v6, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s1 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[21:24], off, s[4:7], 0 offset:16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s22 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[4:7], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[4:7], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: @@ -7128,9 +7124,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s18, s7, 16 ; 
GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s22, s7 ; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 24 -; GFX7-HSA-NEXT: s_mov_b32 s22, s7 ; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8 ; GFX7-HSA-NEXT: s_mov_b32 s28, s5 @@ -8206,157 +8202,159 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24 -; GFX6-NOHSA-NEXT: s_mov_b32 s34, s7 -; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24 -; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 24 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s46, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s50, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16 -; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s1, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s6, s1 -; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 -; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s56 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s39 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s19 -; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s8 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000 -; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s22 +; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 +; GFX6-NOHSA-NEXT: 
s_lshr_b32 s12, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s16, s7 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s20 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[17:20], off, s[8:11], 0 offset:208 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s29 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s24 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:192 +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 16 +; GFX6-NOHSA-NEXT: s_ashr_i32 s14, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s23 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25 -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s14 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[17:20], off, s[8:11], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s18 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:128 +; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s0, 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s12, s5 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v18, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s25 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[17:20], off, s[8:11], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s22 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s0, s0, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s4, s3 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s4, s3, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s23 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s4 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s14 +; GFX6-NOHSA-NEXT: 
buffer_store_dwordx4 v[3:6], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: s_lshr_b32 s0, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s4, s1 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s4, s1, 24 +; GFX6-NOHSA-NEXT: s_ashr_i32 s14, s7, 31 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:240 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s19 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[8:11], 0 offset:224 +; GFX6-NOHSA-NEXT: s_lshr_b32 s0, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s5, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s3, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s1, 8 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s5, 31 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s3 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[9:12], off, s[8:11], 0 offset:176 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[13:16], off, s[8:11], 0 offset:160 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[8:11], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[8:11], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -8371,168 +8369,167 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 8 -; GFX7-HSA-NEXT: s_mov_b32 s68, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s53, s1, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s38, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s68, s1, 8 +; GFX7-HSA-NEXT: s_mov_b32 s70, s1 +; GFX7-HSA-NEXT: s_bfe_i64 
s[14:15], s[0:1], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 16 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s3, 31 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s24, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s16, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s62, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s46, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s26, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s20, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s62, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s64, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i32 s44, s3, 24 -; GFX7-HSA-NEXT: s_ashr_i32 s45, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s46, s5, 24 -; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s48, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s54, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s55, s3, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s56, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s57, s5, 24 +; 
GFX7-HSA-NEXT: s_ashr_i32 s58, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s59, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[68:69], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX7-HSA-NEXT: 
s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xd0 ; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 ; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xc0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 ; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s61 +; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s61 +; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s61 +; GFX7-HSA-NEXT: s_add_u32 s60, s8, 64 +; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v17, s61 +; GFX7-HSA-NEXT: s_add_u32 s60, s8, 16 +; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v40, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v41, s61 +; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s36 +; GFX7-HSA-NEXT: s_add_u32 s36, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s63 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v42, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s47 +; GFX7-HSA-NEXT: s_addc_u32 s37, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v43, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s44 +; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[12:13], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s36 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s58 +; GFX7-HSA-NEXT: s_add_u32 s14, s8, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v37, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v38, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v39, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v20, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[28:31] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[32:35] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[40:41], v[36:39] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[42:43], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-HSA-NEXT: s_add_u32 s14, s8, 0xa0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8541,8 +8538,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8561,7 +8558,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s53 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll index c119ef274bb04..cb617e770c609 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -706,12 +706,12 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 
-; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 @@ -719,26 +719,25 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] ; GCN-HSA-NEXT: s_endpgm ; ; GCNX3-NOHSA-LABEL: global_load_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 
09d3c3b01b809..988b53f9ce730 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -2216,40 +2216,40 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; 
GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: @@ -2666,47 +2666,47 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 
16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 @@ -2724,27 +2724,27 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: s_addc_u32 s5, 
s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 @@ -2753,64 +2753,64 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; 
GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[21:24] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[22:25] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v13 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[31:32], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: @@ -2831,45 +2831,46 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; 
GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-NOHSA-VI-NEXT: 
v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -3080,48 +3081,48 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v1 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v2 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v3, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; 
GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v11, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v7, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v15, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v8, 0, 16 ; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v15, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -3130,13 +3131,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; 
GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -3153,78 +3154,78 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: 
v_ashrrev_i32_e32 v18, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[16:19] +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v9 +; GCN-HSA-NEXT: 
v_ashrrev_i32_e32 v18, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v19, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v17, v14, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[17:20] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v20, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v18, v8, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[18:21] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v14, v11, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5 +; GCN-HSA-NEXT: v_bfe_i32 v12, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v21, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v19, v6, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[5:8] +; GCN-HSA-NEXT: flat_store_dwordx4 v[31:32], v[19:22] ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[33:34], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: @@ -3245,44 +3246,45 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; 
GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v12, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v1 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v0 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v5 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 
offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v13 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v5 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v15 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -3531,107 +3533,106 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[43:46], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v15 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v14 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 
offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v13 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v12 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(9) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v25, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(13) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; 
GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v29, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(11) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v30 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v55, 
0xffff, v27 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(10) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v58, 16, v34 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v33 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v29, 0xffff, v34 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v55, 0xffff, v33 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v61, 0xffff, v32 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v59, 0xffff, v31 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(9) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(10) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v38 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v37 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v36 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v36 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v35 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(9) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; 
GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v39 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v42, 16, v46 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v45 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v44 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xffff, v46 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v39, 0xffff, v45 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload @@ -3657,191 +3658,190 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: s_add_u32 s12, s2, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 +; GCN-HSA-NEXT: s_addc_u32 s13, s3, 0 +; GCN-HSA-NEXT: 
flat_load_dwordx4 v[20:23], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] +; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13] -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[14:15] +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v25 -; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v24 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[37:38], v[32:35] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x80 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v27 -; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v26 -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[32:35] -; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 
0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v42, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v41, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v40, s5 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v35, 0xffff, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, s4 +; GCN-HSA-NEXT: v_and_b32_e32 v33, 0xffff, v6 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(6) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[33:36] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; 
GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[32:35] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v28 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v28 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; GCN-HSA-NEXT: flat_store_dwordx4 v[41:42], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v33, 0xffff, v31 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v31, 0xffff, v30 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v30 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v24 +; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v30, 16, v27 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[31:34] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; GCN-HSA-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v27, 0xffff, v26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[37:38], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[39:40], v[27:30] +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21 ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v23 +; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[1:4] +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GCN-HSA-NEXT: flat_store_dwordx4 
v[0:1], v[20:23] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: @@ -3863,10 +3863,10 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 -; 
GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[43:46], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 @@ -3875,90 +3875,89 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v15 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 
0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v22, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v23, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v44 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v11 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; GCN-NOHSA-VI-NEXT: 
v_and_b32_e32 v10, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v40 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v57, 0xffff, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v27 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v58, 16, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v33 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 
16, v32 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v57, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v33 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v32 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v31 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v38 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v37 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v35 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v40 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v46 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v45 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v46 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v45 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, 
s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload @@ -4351,12 +4350,12 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; 
GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) @@ -4373,19 +4372,19 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v35 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v34 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v35, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v34, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v33 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v32 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v33, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v32, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v16 +; GCN-NOHSA-SI-NEXT: 
v_bfe_i32 v14, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v16, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v38, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v39 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v38 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v39, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v38, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v37 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v36 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v37, 0, 16 @@ -4399,51 +4398,51 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v40 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v41, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v40, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v31 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v31, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v30, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v29 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v29, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v35 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v34 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v35, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v34, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v32 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v33, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v32, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v34, 16, v31 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v32, 16, v30 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v31, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v31, v30, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v29 +; 
GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v28 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v28, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 16, v27 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 16, v26 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v27, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v25 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v25, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v24, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v18, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v20 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v20, 0, 16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, 
s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 @@ -4469,13 +4468,13 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -4489,11 +4488,11 
@@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[8:9] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[8:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] @@ -4502,8 +4501,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v29 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v28 @@ -4513,142 +4512,143 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 16, v31 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 16, v30 -; GCN-HSA-NEXT: v_bfe_i32 v33, v31, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v31, v30, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[31:34] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCN-HSA-NEXT: 
s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[37:38], v[32:35] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 -; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v42, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v41, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 16, v31 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v35, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v33, v30, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[33:36] +; GCN-HSA-NEXT: v_mov_b32_e32 v40, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 -; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v25 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v24 +; GCN-HSA-NEXT: v_bfe_i32 v30, v25, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v24, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, s4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 16, v27 +; GCN-HSA-NEXT: flat_store_dwordx4 v[37:38], v[28:31] ; 
GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 16, v26 +; GCN-HSA-NEXT: v_bfe_i32 v31, v27, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v29, v26, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[39:40], v[29:32] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v21 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v30, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v22, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v20 +; GCN-HSA-NEXT: v_bfe_i32 v26, v21, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v24, v20, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] 
-; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 16, v23 +; GCN-HSA-NEXT: flat_store_dwordx4 v[41:42], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v22 +; GCN-HSA-NEXT: v_bfe_i32 v27, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v25, v22, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[33:34], v[25:28] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GCN-HSA-NEXT: s_waitcnt vmcnt(11) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v5 +; GCN-HSA-NEXT: v_bfe_i32 v28, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v20, v6, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v26, v4, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[26:29] +; GCN-HSA-NEXT: v_bfe_i32 v22, v7, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; 
GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v23, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v21, v0, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24 -; GCN-HSA-NEXT: v_bfe_i32 v12, v24, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v23, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v21, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[37:38], v[21:24] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[21:24] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v24, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v22, v2, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v13 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v29, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v27, v8, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[22:25] +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[27:30] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v1, v14, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v25, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v23, v12, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 -; 
GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 -; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v17 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v17, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v17, v16, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v17, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v5, v16, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[5:8] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v18 +; GCN-HSA-NEXT: v_bfe_i32 v30, v19, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v18, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25 -; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v27 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v26 -; GCN-HSA-NEXT: v_bfe_i32 v6, v27, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v26, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[23:26] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v15 +; 
GCN-HSA-NEXT: v_bfe_i32 v3, v15, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[1:4] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: @@ -4670,10 +4670,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[43:46], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 @@ -4682,91 +4682,92 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v13 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte 
Folded Spill -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v19 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(10) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v9 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v42, 16, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v40, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v41, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v8, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v50, 16, v1 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v48, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v49, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v50, 
16, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v48, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v49, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v1 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v36 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v35 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v35, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v46, 16, v5 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v44, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v45, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v54, 16, v26 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v52, 16, v25 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v53, v26, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v25, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v58, 16, v24 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v56, 16, v23 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v57, v24, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v23, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v30 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v29 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v30, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v29, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v62, 16, v28 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v60, 16, v27 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v61, v28, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v27, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v34 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v33 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v34, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v33, 0, 16 -; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v32 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v31 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v32, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v31, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v44 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v43 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v44, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v43, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v5 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v54, 16, v34 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v52, 16, v33 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v53, v34, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v33, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v58, 16, v32 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v56, 16, v31 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v57, v32, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v31, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v38 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v37 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v38, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v37, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v62, 16, v36 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v60, 16, v35 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v61, v36, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v35, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v38, 16, v42 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v36, 16, v41 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v37, v42, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v35, v41, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v40 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v39 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v40, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v39, 0, 16 +; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v42, 16, v46 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v40, 16, v45 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v41, v46, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v45, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 ; 
GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload @@ -6399,42 +6400,42 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v18, v4, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 ; 
GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: @@ -6601,17 +6602,17 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; 
GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v7 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v5 @@ -6622,24 +6623,24 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v8 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[27:30], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: @@ -6653,82 +6654,82 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, v8 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] -; GCN-HSA-NEXT: 
v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[9:12] +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[14:17] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[17:20] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 ; 
GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s5 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s1 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[10:13] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[14:17] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[6:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[19:22] +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[23:26] +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[31:32], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -6743,47 +6744,47 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v29 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v29 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v5 ; GCN-NOHSA-VI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 @@ -6975,154 +6976,152 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; 
GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 16, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: 
v_ashrrev_i32_e32 v21, 31, v20 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v3 +; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v17, v16, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: v_bfe_i32 v10, v2, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[13:16] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[10:13] +; GCN-HSA-NEXT: 
v_bfe_i32 v9, v0, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v7 +; GCN-HSA-NEXT: v_bfe_i32 v11, v22, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v4, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v5 +; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v19, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v21, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s1 +; GCN-HSA-NEXT: v_bfe_i32 v5, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], 
v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[5:8] +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[19:22] +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[15:18] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: @@ -7140,56 +7139,56 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v6, 0, 16 -; 
GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v6, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v20, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -7384,128 +7383,130 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; 
GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v17 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v15 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v18 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v25 +; GCN-NOHSA-SI-NEXT: 
v_lshrrev_b32_e32 v60, 16, v22 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v22 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v24 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v23 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v25 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v26 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v28 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v39 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v43 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v39 
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v43 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v43 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v61, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v43 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v43 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v43 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v43 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, 
s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[16:19], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: @@ -7524,15 +7525,15 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 @@ -7547,122 +7548,131 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 
v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v26, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[23:26] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v26, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v11 
+; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[23:26] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[21:24] ; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[24:27] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[21:24] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 
v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v6 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[19:22] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[15:18] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8] +; 
GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GCN-HSA-NEXT: s_endpgm ; @@ -7676,96 +7686,95 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[21:24], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[25:28], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; 
GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v32 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v47, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; GCN-NOHSA-VI-NEXT: 
v_and_b32_e32 v41, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 
v46, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v35 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v47, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v35 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v35 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v25 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v35 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v35 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -8112,115 +8121,113 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 0, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: 
v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v1 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v1 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v1, 0, 16 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16 -; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v13 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v31, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 
offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v32, 31, v31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: @@ -8241,8 +8248,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -8250,161 +8257,161 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_bfe_i32 v20, v20, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v18, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: 
v_ashrrev_i32_e32 v21, 31, v20 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[18:21] +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_bfe_i32 v13, v26, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s4 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[13:16] +; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_bfe_i32 v13, v9, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: v_bfe_i32 v9, v29, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: v_bfe_i32 v17, v28, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[9:12] +; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v30, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 +; GCN-HSA-NEXT: s_waitcnt vmcnt(6) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_bfe_i32 v17, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v2, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: v_bfe_i32 v14, v32, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v21, v23, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v5 +; GCN-HSA-NEXT: v_bfe_i32 v25, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v7 +; GCN-HSA-NEXT: v_bfe_i32 v11, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v5, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v29, v6, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-HSA-NEXT: v_bfe_i32 v31, v34, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[5:8] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7 -; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v31 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 
v17, 31, v16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v19, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[29:32] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_bfe_i32 v19, v12, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v17, v4, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[21:24] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: v_bfe_i32 v13, v33, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[25:28] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: @@ -8417,116 +8424,115 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; 
GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[9:12], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v19, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; 
GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v15 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v19, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 
v10, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v25 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v26 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v25, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v26 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v23 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v35, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 31, v33 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v36, 31, v35 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 31, v31 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index e55fb2cac0985..a99aeadafb009 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -2375,74 +2375,74 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOHSA-NEXT: s_mov_b32 s8, s6 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; SI-NOHSA-NEXT: buffer_load_dwordx4 
v[4:7], off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v0 -; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v0 -; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v1 -; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v2 -; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v3 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v6 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v11 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 ; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v4 ; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v5 -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9 -; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10 -; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11 +; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v6 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v9 +; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v7 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v0 +; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v0 +; SI-NOHSA-NEXT: v_mov_b32_e32 v30, 
v1 +; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v2 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v13 -; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v14 -; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v15 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15 +; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v3 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v10 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v2, 31, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v1, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v9 +; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v10 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v13 +; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v11 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v5, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v13 +; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v14 +; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v15 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; SI-NOHSA-NEXT: buffer_store_dwordx4 
v[16:19], off, s[0:3], 0 offset:80 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:32 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:16 ; SI-NOHSA-NEXT: s_endpgm ; ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -2454,78 +2454,78 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; 
GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 +; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15 -; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 
48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v15 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[21:22], v[16:19] +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v9 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v4 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v10 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v11 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[23:24], v[17:20] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s1 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v7 +; GCNX3-HSA-NEXT: 
v_ashrrev_i32_e32 v18, 31, v6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v6 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[27:28], v[13:16] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v7 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[29:30], v[9:12] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[31:32], v[17:20] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[4:7] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[21:22], v[8:11] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[23:24], v[4:7] ; GCNX3-HSA-NEXT: s_endpgm ; ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: @@ -2538,56 +2538,56 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:48 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, 
v5 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v24, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v26, v5 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v20, v6 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v22, v7 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v10 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v11 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v5 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v24, v10 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v26, v11 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v5 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v20, v6 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v13 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v14 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, 
v15 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v15 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v22, v7 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, v0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v1 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v13 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v2, 31, v14 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v12 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v12 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v13 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, v14 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v15 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i32_to_v16i64: @@ -2678,60 
+2678,115 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; EG-NEXT: MOV * T16.Z, T1.Y, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64: -; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0 -; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 -; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48 -; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16 -; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3] -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(2) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v32, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v15 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16 -; GCN-HSA-NEXT: s_endpgm +; GCN-GFX900-HSA-LABEL: global_sextload_v16i32_to_v16i64: +; GCN-GFX900-HSA: ; %bb.0: +; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v40, 0 +; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v40, s[2:3] offset:32 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v40, s[2:3] offset:48 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v40, s[2:3] offset:16 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[12:15], v40, s[2:3] +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v24, v4 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v26, v5 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v1 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v20, v6 +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v11 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v22, v7 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 
+; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v0 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v28, v0 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v30, v1 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v16, v2 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v9 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v18, v3 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v2, 31, v10 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v8 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v32, v8 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v34, v9 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v1, v10 +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v15 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v3, v11 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v14 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v39, 31, v13 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v37, 31, v12 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v36, v12 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v38, v13 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v5, v14 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v7, v15 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:96 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:112 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:64 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:80 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v40, v[1:4], s[0:1] offset:48 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v40, v[36:39], s[0:1] +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v40, v[5:8], s[0:1] offset:16 +; GCN-GFX900-HSA-NEXT: s_endpgm +; +; GCN-GFX908-HSA-LABEL: global_sextload_v16i32_to_v16i64: +; GCN-GFX908-HSA: ; %bb.0: +; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v36, 0 +; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 +; 
GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48 +; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16 +; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3] +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v24, v4 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v26, v5 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v20, v6 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v22, v7 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v0 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v1 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v16, v2 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v18, v3 +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v28, v8 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v30, v9 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, v10 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v2, v11 +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 +; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v32, v12 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v34, v13 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, v14 +; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v10, v15 +; 
GCN-GFX908-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16 +; GCN-GFX908-HSA-NEXT: s_endpgm %ld = load <16 x i32>, ptr addrspace(1) %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -2828,25 +2883,25 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v3 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCNX3-HSA-NEXT: 
flat_store_dwordx4 v[22:23], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5 @@ -2856,17 +2911,17 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v7 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 -; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[16:19] +; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 -; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 @@ -3075,114 +3130,123 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOHSA-NEXT: s_mov_b32 s8, s6 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 ; 
SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v31 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v30 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(6) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v40, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v42, v13 -; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v14 -; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v15 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28 -; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v28 -; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29 -; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30 -; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31 -; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill -; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v7 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v11 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v15 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v13 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v14 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v13 +; SI-NOHSA-NEXT: v_mov_b32_e32 v40, v14 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v9 +; 
SI-NOHSA-NEXT: v_mov_b32_e32 v42, v15 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v10 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v48, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v50, v9 +; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v10 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v5 +; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v11 +; SI-NOHSA-NEXT: buffer_store_dword v52, off, s[12:15], 0 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: buffer_store_dword v53, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: buffer_store_dword v54, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: buffer_store_dword v55, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v6 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v4 +; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v4 +; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v5 +; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v6 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v7 +; SI-NOHSA-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; SI-NOHSA-NEXT: s_waitcnt expcnt(4) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v1 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v0 +; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v0 +; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v1 +; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v2 +; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v3 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(11) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v19 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v18 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v17 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v16 +; SI-NOHSA-NEXT: 
v_mov_b32_e32 v56, v16 +; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v17 +; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v18 +; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v19 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(10) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v23 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v22 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v21 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v20 +; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v20 +; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v21 +; SI-NOHSA-NEXT: v_mov_b32_e32 v13, v22 +; SI-NOHSA-NEXT: v_mov_b32_e32 v15, v23 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(9) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v31 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v30 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4 -; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v4 -; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v5 -; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v6 -; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v7 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v29 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v28 +; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v28 +; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v29 +; SI-NOHSA-NEXT: v_mov_b32_e32 v17, v30 +; SI-NOHSA-NEXT: v_mov_b32_e32 v19, v31 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0 -; SI-NOHSA-NEXT: v_mov_b32_e32 v48, v0 -; SI-NOHSA-NEXT: v_mov_b32_e32 v50, v1 -; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2 -; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16 -; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v16 -; 
SI-NOHSA-NEXT: v_mov_b32_e32 v54, v17 -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(6) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20 -; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v20 -; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v21 -; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v22 -; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v23 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v27 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v26 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v25 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v24 -; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v24 -; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v25 -; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v26 -; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v27 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9 -; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v10 -; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v11 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v35 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v34 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v33 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v32 +; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v32 +; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v33 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v34 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v35 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192 -; SI-NOHSA-NEXT: buffer_load_dword v8, off, 
s[12:15], 0 ; 4-byte Folded Reload -; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:224 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:240 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192 +; SI-NOHSA-NEXT: buffer_load_dword v32, off, s[12:15], 0 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: buffer_load_dword v35, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:160 +; SI-NOHSA-NEXT: s_waitcnt expcnt(1) +; SI-NOHSA-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:16 ; 4-byte 
Folded Reload +; SI-NOHSA-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: buffer_load_dword v35, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:96 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:64 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:80 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NOHSA-NEXT: s_endpgm ; ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: @@ -3204,26 +3268,26 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] +; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 
v1, s5 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3232,8 +3296,8 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28 @@ -3243,124 +3307,124 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v31 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v30 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v30 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v31 +; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[37:38], v[32:35] +; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 +; 
GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v42, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v41, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v31 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v30 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, v30 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, v31 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[33:36] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v25 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v25 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[37:38], v[28:31] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v40, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v27 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v26 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v26 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v27 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s4 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v27 +; GCNX3-HSA-NEXT: 
v_ashrrev_i32_e32 v30, 31, v26 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, v26 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, v27 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[39:40], v[29:32] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v21 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v20 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v20 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v21 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v21 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v20 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v20 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, v21 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v23 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v22 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v22 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v23 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15 -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[41:42], v[25:28] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v41, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v40, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v13 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, v13 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v7 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v23 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v22 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v22 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v23 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[30:33] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[25:28] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v15 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], 
v[21:24] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v18 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v5 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v5 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[40:41], v[26:29] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v40, s3 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v7 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v7 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v17 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[22:25] +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v18 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v17 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v18 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v19 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v9 +; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v6, v19 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, v9 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[35:36], v[23:26] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[37:38], v[4:7] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[39:40], v[31:34] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v11 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, v11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -3370,15 +3434,15 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v3 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v2 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[27:30] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] ; GCNX3-HSA-NEXT: s_endpgm ; ; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: @@ -3391,9 +3455,9 @@ define 
amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 @@ -3402,93 +3466,92 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v11 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v10 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v7 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v13 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v14 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v15 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v8 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v9 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v10 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v11 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v11 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6 -; 
GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v5 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v15 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v13 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v9 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v14 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v12 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v12 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v13 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v14 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v15 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v10 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v8 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v8 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v9 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v10 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v11 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v19 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v16 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v56, v16 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v1 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v52, v16 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v54, v17 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v18 
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v56, v20 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v58, v21 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v22 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v23 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v23 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v6 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v5 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v6 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v1 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v7 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v52, v0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v54, v1 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, v2 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v19 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v3 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v2, 31, v18 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v17 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v58, v17 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, v18 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v19 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v22 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v21 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v20 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v60, v20 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v62, v21 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v13, v22 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v15, v23 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v26 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 -; 
GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v24 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v31 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v30 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v31 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v29 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v28 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v28 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v29 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v30 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v24 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v25 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v26 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v27 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v27 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:224 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v25 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 
v[40:43], off, s[0:3], 0 offset:240 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v31 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v46, 31, v29 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v26 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v24 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v30 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v44, 31, v28 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v43, v28 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v45, v29 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:208 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:160 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:144 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:96 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:112 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:64 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:80 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:32 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v30 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v31 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v20, v24 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v22, v25 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v26 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v27 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:48 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i32_to_v32i64: @@ -3661,117 +3724,117 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; 
GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3] ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v61, 0 ; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17 ; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:80 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:96 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[12:15], v61, s[2:3] offset:112 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v61, s[2:3] offset:80 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:64 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:48 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:32 +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:16 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v6 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v7 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v52 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8 -; 
GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v37, v8 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v39, v9 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v10 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v11 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v4 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v5 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v29, off, s[20:23], 0 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v15 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v13 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v14 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v39, v13 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v14 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v15 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v51 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v51 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v52 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v50 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v12 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v37, v12 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v49 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v41, v49 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v50 +; GCN-GFX900-HSA-NEXT: buffer_store_dword v13, off, s[20:23], 0 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v30, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v31, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v32, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v16 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v15 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v45, v13 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v47, v14 -; 
GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v15 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v16 +; GCN-GFX900-HSA-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v7 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v5 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v6 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v4 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v4 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v5 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v45, v6 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v47, v7 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v49, v17 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v51, v18 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v19 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v20 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v20 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v19 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v18 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v17 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v53, v17 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v55, v18 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v19 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v20 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v53, v21 -; 
GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v55, v22 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v22 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v21 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v21 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v22 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v17, v23 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v19, v24 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] +; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[37:40], s[0:1] offset:224 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[33:36], s[0:1] offset:240 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[41:44], s[0:1] offset:192 +; GCN-GFX900-HSA-NEXT: buffer_load_dword v37, off, s[20:23], 0 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v36, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v41, v25 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28 +; GCN-GFX900-HSA-NEXT: buffer_load_dword v38, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: 
buffer_load_dword v39, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v40, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v28 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v26 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v27 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v25 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v49, v25 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v51, v26 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v27 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v28 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v2 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v2 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v3 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v23 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v24 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v22 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v22 +; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v21 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v21 +; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v22 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208 -; 
GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:144 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[25:28], s[0:1] offset:32 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:48 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[37:40], s[0:1] offset:208 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[29:32], s[0:1] offset:160 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[45:48], s[0:1] offset:176 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[49:52], s[0:1] offset:128 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[8:11], s[0:1] offset:144 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[53:56], s[0:1] offset:96 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[4:7], s[0:1] offset:112 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[57:60], s[0:1] offset:64 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[17:20], s[0:1] offset:80 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[25:28], s[0:1] offset:32 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[13:16], s[0:1] offset:48 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[33:36], s[0:1] +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v61, v[0:3], s[0:1] offset:16 ; GCN-GFX900-HSA-NEXT: s_endpgm ; ; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64: @@ 
-3898,35 +3961,35 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3 -; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOHSA-NEXT: s_mov_b32 s8, s6 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7 -; SI-NOHSA-NEXT: s_mov_b32 s0, s4 -; SI-NOHSA-NEXT: s_mov_b32 s1, s5 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80 +; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v1 +; SI-NOHSA-NEXT: s_mov_b32 s0, s4 +; SI-NOHSA-NEXT: s_mov_b32 s1, s5 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8 -; SI-NOHSA-NEXT: 
v_mov_b32_e32 v2, v9 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v13 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v14 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v15 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v32 @@ -3969,12 +4032,12 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v13 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v14 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v15 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NOHSA-NEXT: s_endpgm ; @@ -3992,7 +4055,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s8, s2, 48 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s10, s2, 64 ; GCNX3-HSA-NEXT: s_addc_u32 s11, s3, 0 @@ -4004,7 +4067,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx4 
v[32:35], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] @@ -4013,123 +4076,122 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s7 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v28 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v29 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCNX3-HSA-NEXT: s_addc_u32 
s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v33 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] +; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v34 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s5 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v28 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v29 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; 
GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s4 +; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[0:3] +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v25 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: s_nop 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v27 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s4 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v20 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v21 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] +; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, 
v22 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] +; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11) -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCNX3-HSA-NEXT: s_nop 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: 
flat_store_dwordx4 v[8:9], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 @@ -4163,75 +4225,75 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v29, 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v31, v29 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v33, 0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v33 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v1 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:224 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v1 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:224 ; GCNX3-NOHSA-NEXT: s_nop 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v2 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v3 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v2 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v3 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:240 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(8) -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 
v30, v5 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:192 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v5 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192 ; GCNX3-NOHSA-NEXT: s_nop 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v6 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v7 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v6 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v7 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(9) -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v8 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v9 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:160 ; GCNX3-NOHSA-NEXT: s_nop 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v10 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v11 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v10 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v11 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(10) -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v13 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:128 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v12 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v13 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:128 ; GCNX3-NOHSA-NEXT: s_nop 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v14 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v15 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v14 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v15 +; 
GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:144 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(11) -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v16 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v17 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v16 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v17 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 ; GCNX3-NOHSA-NEXT: s_nop 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v18 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v19 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v18 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v19 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v20 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v21 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v20 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v21 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64 ; GCNX3-NOHSA-NEXT: s_nop 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v22 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v23 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v22 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v23 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(13) -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v24 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v25 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v24 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v25 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 ; GCNX3-NOHSA-NEXT: s_nop 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v26 -; GCNX3-NOHSA-NEXT: 
v_mov_b32_e32 v30, v27 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v26 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v27 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(14) -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v32 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v33 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v28 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v29 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 ; GCNX3-NOHSA-NEXT: s_nop 0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v34 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v35 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v30 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v31 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i32_to_v32i64: @@ -4479,8 +4541,6 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOHSA-NEXT: s_mov_b32 s4, s0 -; SI-NOHSA-NEXT: s_mov_b32 s5, s1 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 @@ -4491,6 +4551,8 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96 ; SI-NOHSA-NEXT: buffer_store_dwordx4 
v[8:11], off, s[4:7], 0 offset:112 @@ -4514,14 +4576,13 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s4 +; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 @@ -4534,70 +4595,69 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x60 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GCNX3-HSA-NEXT: flat_load_dwordx4 
v[24:27], v[24:25] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] -; GCNX3-HSA-NEXT: s_nop 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: s_add_u32 s6, s0, 64 +; GCNX3-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: s_add_u32 s6, s0, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s4 +; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s7 +; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s6 +; GCNX3-HSA-NEXT: s_add_u32 s6, s0, 48 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCNX3-HSA-NEXT: s_add_u32 s8, s0, 32 -; GCNX3-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCNX3-HSA-NEXT: s_add_u32 s10, s0, 48 -; GCNX3-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GCNX3-HSA-NEXT: s_nop 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s9 -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[8:11] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s6 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s7 ; 
GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s7 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[4:7] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[16:19] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[28:31] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[28:31] ; GCNX3-HSA-NEXT: s_endpgm ; ; GCNX3-NOHSA-LABEL: global_load_v32i32: @@ -4610,22 +4670,24 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64 +; 
GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:96 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:64 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:80 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:96 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:112 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:64 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:80 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index f879dc660203f..62a785826cb4b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -2085,39 +2085,39 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr 
addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v3 -; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v3 +; GCN-HSA-NEXT: v_bfe_u32 v12, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v3 +; GCN-HSA-NEXT: v_bfe_u32 v13, v3, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v4, v0, 8, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v2 +; GCN-HSA-NEXT: v_bfe_u32 v14, v2, 16, 8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GCN-HSA-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GCN-HSA-NEXT: v_bfe_u32 v5, v0, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v1 ; GCN-HSA-NEXT: v_bfe_u32 v9, v1, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 
0xff, v2 -; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i32: @@ -2324,39 +2324,39 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v3 -; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0 -; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v3 +; GCN-HSA-NEXT: v_bfe_i32 v13, v3, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v3, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v11, v3, 0, 8 ; GCN-HSA-NEXT: 
v_ashrrev_i32_e32 v15, 24, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1 -; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 24, v0 ; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v1 +; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v7, v1, 0, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i32: @@ -2545,12 +2545,16 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v2, 8, 8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v3 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 24, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8 +; 
GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v2, 8, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v3, 8, 8 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v0, 16, 8 @@ -2559,30 +2563,26 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff, v2 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v2, 16, 8 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 24, v7 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v3, 16, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v4, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v5, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v6, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v7, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v4, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xff, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v26, v5, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xff, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v30, v6, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xff, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v34, v7, 16, 8 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v4, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v5, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v6, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v37, v7, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xff, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 
v26, v4, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xff, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v30, v5, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xff, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v34, v6, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xff, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v38, v7, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -2593,89 +2593,89 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 
v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v7 -; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v7 -; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v6 -; GCN-HSA-NEXT: v_bfe_u32 v8, v6, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v6 -; GCN-HSA-NEXT: v_bfe_u32 v9, v6, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v7 +; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v7 +; GCN-HSA-NEXT: v_bfe_u32 v11, v7, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[9:12] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v5 +; GCN-HSA-NEXT: v_bfe_u32 v11, v6, 8, 8 +; GCN-HSA-NEXT: 
v_and_b32_e32 v10, 0xff, v6 +; GCN-HSA-NEXT: v_bfe_u32 v12, v6, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; GCN-HSA-NEXT: v_bfe_u32 v7, v5, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v5 -; GCN-HSA-NEXT: v_bfe_u32 v8, v5, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[6:9] -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; GCN-HSA-NEXT: v_bfe_u32 v6, v4, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v7, v4, 16, 8 +; GCN-HSA-NEXT: v_bfe_u32 v12, v5, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v5 +; GCN-HSA-NEXT: v_bfe_u32 v13, v5, 16, 8 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_bfe_u32 v13, v4, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v4 +; GCN-HSA-NEXT: v_bfe_u32 v14, v4, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[12:15] ; GCN-HSA-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GCN-HSA-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GCN-HSA-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0 -; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8 -; GCN-HSA-NEXT: 
v_and_b32_e32 v3, 0xff, v1 -; GCN-HSA-NEXT: v_bfe_u32 v5, v1, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v2 -; GCN-HSA-NEXT: v_bfe_u32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v2 +; GCN-HSA-NEXT: v_bfe_u32 v11, v2, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[4:7] +; GCN-HSA-NEXT: v_bfe_u32 v14, v1, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v0 +; GCN-HSA-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v1 +; GCN-HSA-NEXT: v_bfe_u32 v15, v1, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16] +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[5:8] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i32: @@ -2695,16 +2695,19 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 24, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v33, v7, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xff, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v34, v7, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 24, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v37, v7, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xff, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v38, v7, 16, 8 ; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v1, 8, 8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v2, 8, 8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 24, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 24, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v1, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v2, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v3, 8, 8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v0, 16, 8 @@ -2714,22 +2717,19 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v2, 16, 8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v3 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v3, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v4, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 24, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v5, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v6, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v4, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xff, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v26, v5, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xff, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v30, v6, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v4, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v5, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v33, v6, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xff, 
v4 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v26, v4, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xff, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v30, v5, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xff, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v34, v6, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -2956,44 +2956,44 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 24, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 24, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 24, v2 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v3 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 24, v4 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 24, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 24, v6 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v0, 16, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v0, 8, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v0, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 24, v1 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v1, 16, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v1, 8, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v1, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 24, v2 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v2, 16, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v2, 8, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v3 ; GCN-NOHSA-SI-NEXT: 
v_bfe_i32 v22, v3, 16, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v3, 8, 8 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 24, v7 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v3, 0, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v4, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v4, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v4, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 24, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v5, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v5, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 24, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v6, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v6, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v6, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 24, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v7, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v7, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v7, 0, 8 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v4, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v4, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v4, 0, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v5, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v5, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v5, 0, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v6, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v6, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v6, 0, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v7, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v37, v7, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v7, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -3017,76 +3017,76 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v7 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v6 -; 
GCN-HSA-NEXT: v_bfe_i32 v8, v6, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 24, v7 +; GCN-HSA-NEXT: v_bfe_i32 v11, v7, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v10, v7, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 24, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[9:12] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 24, v4 +; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v11, v6, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v10, v6, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 24, v5 -; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[5:8] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v4 -; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v5, v4, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[4:7] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v3 -; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v5 +; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v11, v5, 0, 8 +; GCN-HSA-NEXT: v_bfe_i32 v7, v4, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 8, 8 +; 
GCN-HSA-NEXT: v_bfe_i32 v5, v4, 0, 8 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0 -; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1 -; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 24, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[5:8] +; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v15, v3, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v14, v3, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 24, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[14:17] +; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v11, v1, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[2:5] +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[10:13] +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[6:9] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i8_to_v32i32: @@ -3106,41 +3106,41 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 24, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 24, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v7, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v7, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v7, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 24, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v7, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v37, v7, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v7, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v1 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 24, v2 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v4 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 24, v5 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 24, v6 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v0, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v0, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v1 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v1, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v1, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 24, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v2, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v3 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v3, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v3, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v4, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 
8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v5, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v5, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 24, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v6, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v6, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v6, 0, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v4, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v4, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v4, 0, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v5, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v5, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v5, 0, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v6, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v6, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v6, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -3413,42 +3413,42 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], 
off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v5, v13, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v12, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 24, v15 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v15, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v14, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v13, 16, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 24, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v5, v9, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v8, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v11, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v10, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v9, 16, 8 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v12, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff, v15 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v15, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v14, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v8 +; GCN-NOHSA-SI-NEXT: 
v_bfe_u32 v14, v8, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff, v11 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v11, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v10 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v10, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v1, 8, 8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v0, 8, 8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 24, v3 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v37, v3, 8, 8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 24, v2 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v41, v2, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v14, v1, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v1, 16, 8 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xff, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v34, v0, 16, 8 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xff, v3 @@ -3502,10 +3502,10 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, 
s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload @@ -3533,12 +3533,12 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 @@ -3555,119 +3555,113 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2 ; GCN-HSA-NEXT: v_bfe_u32 v17, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v2 ; GCN-HSA-NEXT: v_bfe_u32 v18, v2, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v14 -; GCN-HSA-NEXT: v_bfe_u32 v17, v14, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v14 -; GCN-HSA-NEXT: v_bfe_u32 v18, v14, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 
v[20:21], v[16:19] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v15 -; GCN-HSA-NEXT: v_bfe_u32 v17, v15, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v15 -; GCN-HSA-NEXT: v_bfe_u32 v18, v15, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v14 +; GCN-HSA-NEXT: v_bfe_u32 v18, v14, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v14 +; GCN-HSA-NEXT: v_bfe_u32 v19, v14, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 24, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[17:20] +; GCN-HSA-NEXT: v_bfe_u32 v16, v12, 16, 8 +; GCN-HSA-NEXT: v_bfe_u32 v19, v15, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v15 +; GCN-HSA-NEXT: v_bfe_u32 v20, v15, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[16:19] -; GCN-HSA-NEXT: v_bfe_u32 v15, v12, 8, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[18:21] +; GCN-HSA-NEXT: v_bfe_u32 v15, v12, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v12 -; GCN-HSA-NEXT: v_bfe_u32 v16, v12, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v13 -; GCN-HSA-NEXT: v_bfe_u32 v15, v13, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v13 -; GCN-HSA-NEXT: v_bfe_u32 v16, v13, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_u32 v13, v10, 8, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v10 -; GCN-HSA-NEXT: v_bfe_u32 v14, v10, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s10 -; GCN-HSA-NEXT: flat_store_dwordx4 
v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v11 -; GCN-HSA-NEXT: v_bfe_u32 v13, v11, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v11 -; GCN-HSA-NEXT: v_bfe_u32 v14, v11, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 24, v13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[14:17] +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; GCN-HSA-NEXT: v_bfe_u32 v16, v13, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff, v13 +; GCN-HSA-NEXT: v_bfe_u32 v17, v13, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s11 +; GCN-HSA-NEXT: v_bfe_u32 v17, v10, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v10 +; GCN-HSA-NEXT: v_bfe_u32 v18, v10, 16, 8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v8 -; GCN-HSA-NEXT: v_bfe_u32 v11, v8, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v8 -; GCN-HSA-NEXT: v_bfe_u32 v12, v8, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v9 -; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v9 -; GCN-HSA-NEXT: v_bfe_u32 v12, v9, 16, 8 +; GCN-HSA-NEXT: v_bfe_u32 v18, v11, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v11 +; GCN-HSA-NEXT: v_bfe_u32 v19, v11, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12 +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v8 +; GCN-HSA-NEXT: v_bfe_u32 v14, v8, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v8 +; GCN-HSA-NEXT: v_bfe_u32 v15, v8, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[10:13] -; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 8, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[13:16] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GCN-HSA-NEXT: v_bfe_u32 v15, v9, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v9 +; GCN-HSA-NEXT: v_bfe_u32 v16, v9, 16, 8 +; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v3 ; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0 -; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8 -; GCN-HSA-NEXT: v_bfe_u32 v18, v1, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v1 -; GCN-HSA-NEXT: v_bfe_u32 v19, v1, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v14, v0, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v0 +; GCN-HSA-NEXT: v_bfe_u32 v15, v0, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 
s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v5 -; GCN-HSA-NEXT: v_bfe_u32 v14, v5, 8, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; GCN-HSA-NEXT: v_bfe_u32 v9, v4, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v15, v5, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v4 -; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GCN-HSA-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GCN-HSA-NEXT: v_bfe_u32 v10, v1, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v6 -; GCN-HSA-NEXT: v_bfe_u32 v18, v6, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v6 -; GCN-HSA-NEXT: v_bfe_u32 v19, v6, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[17:20] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 24, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[8:11] +; GCN-HSA-NEXT: v_bfe_u32 v16, v4, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v10, v5, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v5 +; GCN-HSA-NEXT: v_bfe_u32 v11, v5, 16, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff, v4 +; GCN-HSA-NEXT: v_bfe_u32 v17, v4, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v7 ; GCN-HSA-NEXT: v_bfe_u32 v1, v7, 8, 8 @@ -3680,11 +3674,16 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GCN-HSA-NEXT: v_bfe_u32 v20, v6, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v6 +; GCN-HSA-NEXT: v_bfe_u32 v21, v6, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[13:16] +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[19:22] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i8_to_v64i32: @@ -3724,18 +3723,18 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v38, 16, 8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v5, v16, 8, 8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 24, v19 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v19, 8, 8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 24, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v5, v16, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v19, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v18, 8, 8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v16 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v16, 16, 8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff, v19 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v19, 16, 8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v18 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v18, 16, 8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v29 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v18, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v29, 8, 8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 24, v28 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v28, 8, 8 @@ -4332,150 +4331,150 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, 
s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 24, v14 ; GCN-HSA-NEXT: v_bfe_i32 v18, v14, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v17, v14, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v15 -; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v16, v15, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 24, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 24, v12 +; GCN-HSA-NEXT: v_bfe_i32 v19, v15, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v18, v15, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 24, v12 -; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v15, v12, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v12, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[17:20] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_bfe_i32 v20, v12, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v19, v12, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v18, v12, 0, 8 +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 24, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 24, v13 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v14, v13, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[13:16] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 24, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v10 -; GCN-HSA-NEXT: v_bfe_i32 v14, v10, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v13, v10, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v10, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_bfe_i32 v16, v10, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v14, v10, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[14:17] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 24, v8 +; GCN-HSA-NEXT: v_bfe_i32 v17, v11, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v16, v11, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v15, v11, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v11 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v11, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v11, 0, 8 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[16:17], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[15:18] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 24, v9 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_bfe_i32 v18, v8, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v17, v8, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v16, v8, 0, 8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 24, v8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v8, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 24, v9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_bfe_i32 v11, v9, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 24, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 24, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[9:12] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v6 -; GCN-HSA-NEXT: v_bfe_i32 v13, v6, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v6, 0, 8 +; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v15, v6, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v14, v6, 0, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v7, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v11, v7, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v10, v7, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v7 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[10:13] +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v4 -; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v7, v4, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v1 -; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v1, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v0 -; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v5 -; GCN-HSA-NEXT: v_bfe_i32 v17, v5, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v15, v5, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v3 -; GCN-HSA-NEXT: v_bfe_i32 
v17, v3, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v15, v3, 0, 8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 24, v2 -; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v4 +; GCN-HSA-NEXT: v_bfe_i32 v13, v4, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 0, 8 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 24, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[12:15] +; GCN-HSA-NEXT: v_bfe_i32 v7, v1, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v5, v1, 0, 8 +; GCN-HSA-NEXT: v_bfe_i32 v17, v0, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v16, v0, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v15, v0, 0, 8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 24, v3 +; GCN-HSA-NEXT: v_bfe_i32 v21, v3, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v20, v3, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v19, v3, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 
s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v25, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v24, v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[5:8] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i8_to_v64i32: @@ -4493,7 +4492,7 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[30:33], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[34:37], off, s[8:11], 0 offset:48 @@ -4501,10 +4500,10 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v11, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v11, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v11, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v17, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v17, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v17, 0, 8 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill @@ -4514,19 +4513,19 @@ define amdgpu_kernel 
void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v36, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v36, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 24, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v10, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v10, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 24, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v13, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v13, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v13, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v12, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 24, v16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 24, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v16, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v16, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v16, 0, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v19, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v19, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v19, 0, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v18, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v18, 8, 8 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 24, v27 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v18, 0, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v27, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v27, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v27, 0, 8 @@ -6070,45 +6069,45 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx2 v[15:16], v[0:1] +; 
GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v16 -; GCN-HSA-NEXT: v_bfe_u32 v0, v16, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v15 -; GCN-HSA-NEXT: v_bfe_u32 v3, v15, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v16, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v13, v15, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GCN-HSA-NEXT: v_bfe_u32 v3, v1, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v13, v0, 8, 
8 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v1 +; GCN-HSA-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[3:6] -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i8_to_v8i64: @@ -6553,18 +6552,18 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v3, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v3, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v0, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v1, 8, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v2, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 24, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v1, 16, 8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 24, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v1, 16, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_u32 v27, v0, 16, 8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 24, v3 ; GCN-NOHSA-SI-NEXT: 
v_bfe_u32 v31, v3, 16, 8 @@ -6578,12 +6577,12 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80 @@ -6591,9 +6590,9 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i8_to_v16i64: @@ -6609,74 +6608,71 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 
v15, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v5 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, v5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 -; GCN-HSA-NEXT: v_bfe_u32 v11, v0, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GCN-HSA-NEXT: v_bfe_u32 v6, v3, 16, 8 +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[6:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v1 -; GCN-HSA-NEXT: v_bfe_u32 v14, v1, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v1 +; GCN-HSA-NEXT: v_bfe_u32 v12, v1, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 -; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GCN-HSA-NEXT: v_bfe_u32 v8, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[6:9] ; GCN-HSA-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 +; GCN-HSA-NEXT: v_bfe_u32 v11, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_addc_u32 s3, 
s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_bfe_u32 v13, v3, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v17, v1, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v5 +; GCN-HSA-NEXT: v_bfe_u32 v15, v0, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v7, v1, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[13:16] +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[5:8] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i64: @@ -6690,49 +6686,49 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v29 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v29 
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v3, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v1, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v0, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v8, v0, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v3, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v1, 16, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v3, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 24, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v0, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v2, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v12, v0, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v2, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v0, v1, 16, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v0, v3, 16, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v1 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v16i8_to_v16i64: @@ -6931,88 +6927,86 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-SI-NEXT: 
v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s1, s9, 24 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s7, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[0:1], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s6, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s6, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s8, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s9 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s8, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s8, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s7, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s9, 8 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s13 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i64: @@ -7025,111 +7019,111 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 -; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24 -; GCN-HSA-NEXT: 
s_lshr_b32 s10, s2, 8 -; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8 -; GCN-HSA-NEXT: s_mov_b32 s22, s3 -; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24 -; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8 -; GCN-HSA-NEXT: s_mov_b32 s24, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-HSA-NEXT: s_lshr_b32 s16, s14, 8 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s2, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s4, s6, 24 +; GCN-HSA-NEXT: s_lshr_b32 s8, s6, 8 +; GCN-HSA-NEXT: s_lshr_b32 s10, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s14, 24 +; GCN-HSA-NEXT: s_lshr_b32 s18, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s7, 8 +; GCN-HSA-NEXT: s_lshr_b32 s22, s15, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: s_lshr_b32 s14, s15, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: s_mov_b32 s16, s15 +; GCN-HSA-NEXT: s_mov_b32 s24, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, 
s17 +; GCN-HSA-NEXT: s_ashr_i32 s28, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s29, s15, 24 +; GCN-HSA-NEXT: s_ashr_i32 s30, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s31, s7, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s26 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s9 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s30 
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[14:17] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[18:21] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[22:25] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i64: @@ -7143,84 +7137,83 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NOHSA-VI-NEXT: 
v_readfirstlane_b32 s9, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s6, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s9, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[0:1], 0x80000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s6, 24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s6, 8 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s8, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[8:9], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s7, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s8, 24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s18 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s17 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s8, 8 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s7, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, s16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s16, s9, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s7, 31 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[10:11], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; 
GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 
s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, s15 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i8_to_v16i64: @@ -7444,137 +7437,145 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[17:20], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[21:24], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v12, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v3, v11, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v10, 8, 8 
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v55, v13, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xff, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v10, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 24, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v11, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v12, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 24, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v37, v13, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v18, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v3, v17, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v16, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v14, v15, 8, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: 
buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v25, 0xff, v18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v15, v15, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v16, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v17, 16, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 24, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v37, v18, 16, 8 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 24, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v43, v17, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xff, v17 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v47, v18, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xff, v18 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v51, v19, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xff, v19 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v20, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v21, 0xff, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 24, v17 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v57, v17, 16, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 24, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v43, v21, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xff, v21 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v47, v22, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xff, v22 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v51, v23, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xff, v23 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v24, 8, 8 +; 
GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v24 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 24, v21 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v57, v21, 16, 8 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v18 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v22 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v4, v18, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v19 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v0, v19, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v8, v20, 16, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v58, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v60, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v36, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v50, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v52, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v46, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v48, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v42, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v44, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v9 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v53, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v54, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v55, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v56, off, s[12:15], 0 offset:44 ; 4-byte Folded 
Spill -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v9 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v4, v22, 16, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v23 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v0, v23, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v53, v24, 16, 8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v58, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v60, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v36, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v54 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v54 
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v50, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v52, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v46, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v48, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v42, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v44, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v54 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v12 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v53, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v55, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v56, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v54 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v54 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 
0 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v54 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:16 -; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, 
s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i64: @@ -7586,134 +7587,142 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 
0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v10 +; GCN-HSA-NEXT: v_bfe_u32 v14, v10, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v9 +; GCN-HSA-NEXT: v_bfe_u32 v17, v9, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GCN-HSA-NEXT: v_bfe_u32 v20, v8, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v9 -; GCN-HSA-NEXT: v_bfe_u32 v10, v9, 16, 8 -; 
GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v8 -; GCN-HSA-NEXT: v_bfe_u32 v10, v8, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v7 -; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GCN-HSA-NEXT: v_bfe_u32 v10, v6, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 -; GCN-HSA-NEXT: v_bfe_u32 v12, v5, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v4 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_bfe_u32 v12, v3, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v7 +; GCN-HSA-NEXT: v_bfe_u32 
v14, v7, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; GCN-HSA-NEXT: v_bfe_u32 v17, v6, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 24, v5 +; GCN-HSA-NEXT: v_bfe_u32 v20, v5, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v4 +; GCN-HSA-NEXT: v_bfe_u32 v14, v4, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 +; GCN-HSA-NEXT: v_bfe_u32 v19, v11, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_bfe_u32 v22, v10, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 +; GCN-HSA-NEXT: v_bfe_u32 v16, v9, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[2:3], v[14:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_bfe_u32 v18, v8, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[9:12] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v5 -; GCN-HSA-NEXT: v_bfe_u32 v0, v5, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[7:10] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v11 +; GCN-HSA-NEXT: v_bfe_u32 v0, v11, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_u32 v16, v8, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v1 +; GCN-HSA-NEXT: v_bfe_u32 v14, v6, 8, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v6 ; 
GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v13, v6, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-HSA-NEXT: v_bfe_u32 v18, v5, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v9, v4, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i64: @@ -7726,92 +7735,92 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v53 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[18:21], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v54 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v53 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v47, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v56, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v54 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v11, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v20, 8, 8 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 24, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v48, v15, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 24, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v45, v16, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: 
v_bfe_u32 v44, v16, 8, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v53 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xff, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v41, v15, 8, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v53 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xff, v15 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 24, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v53 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v36, v14, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 24, v10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v53 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v10, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v3, v10, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v20, v12, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xff, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 24, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v12, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v13, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 24, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v11, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 -; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 24, v23 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v49, v23, 16, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 24, v24 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v46, v24, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v45, v24, 8, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v54 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xff, v24 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v42, v23, 8, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v54 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xff, v23 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v19, 8, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v54 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 24, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v18, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 24, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v21, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v3, v18, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v15, v20, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xff, v21 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v19, 16, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 24, v21 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v21, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v22, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v22 +; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 24, v22 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v37, v22, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v25, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xff, v25 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 24, v25 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v53, v25, 16, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v54 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v54 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v54 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v54 ; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -8173,169 +8182,168 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s12, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x80000 +; 
GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s1, s13, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s22 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000 -; 
GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s13 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[16:17], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s29 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s6, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s6, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s8, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s14, s7, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s27 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s8, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s8, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s10, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 
s[26:27], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s25 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s10, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s10, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s9 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s8, s9, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s23 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s13, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s11 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s8, s11, 24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s10, s13, 31 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 
s13 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s21 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s7, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s9, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s11, 8 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s16, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s7, 31 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 ; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64: @@ -8348,82 +8356,76 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-HSA-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s11, v1 ; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24 -; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8 -; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8 +; GCN-HSA-NEXT: s_lshr_b32 s2, s10, 24 +; GCN-HSA-NEXT: s_lshr_b32 s4, s10, 8 ; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16 -; GCN-HSA-NEXT: s_mov_b32 s28, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s6, s11, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[18:19], 0x80000 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 
-; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 -; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0 -; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v6 +; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v7 +; GCN-HSA-NEXT: v_readfirstlane_b32 s42, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s43, v5 +; GCN-HSA-NEXT: s_lshr_b32 s8, s10, 16 ; GCN-HSA-NEXT: s_mov_b32 s22, s7 -; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s34, s11, 8 +; GCN-HSA-NEXT: s_mov_b32 s10, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[16:17], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24 -; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8 -; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8 -; GCN-HSA-NEXT: s_mov_b32 s4, s45 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24 -; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v6, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s36 +; GCN-HSA-NEXT: s_lshr_b32 s44, s42, 16 +; GCN-HSA-NEXT: s_lshr_b32 s46, s42, 24 +; GCN-HSA-NEXT: s_lshr_b32 s36, s42, 8 +; GCN-HSA-NEXT: s_lshr_b32 s6, s43, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s43, 8 +; GCN-HSA-NEXT: s_mov_b32 s4, s43 +; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 16 +; GCN-HSA-NEXT: s_lshr_b32 s48, s40, 24 +; GCN-HSA-NEXT: s_lshr_b32 s60, s40, 8 ; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16 ; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8 ; GCN-HSA-NEXT: s_mov_b32 s14, s41 -; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24 -; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s53, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s54, s11, 24 +; GCN-HSA-NEXT: s_ashr_i32 s55, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s56, s7, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[34:35], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31 -; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24 -; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31 -; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s37 +; GCN-HSA-NEXT: s_ashr_i32 s33, s43, 31 +; GCN-HSA-NEXT: s_ashr_i32 s52, s43, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[42:43], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s57, s41, 31 +; GCN-HSA-NEXT: s_ashr_i32 s58, s41, 24 ; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 @@ -8431,93 +8433,99 @@ define amdgpu_kernel void 
@global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[46:47], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[60:61], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 -; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41 -; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 -; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s60, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s61, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s61 +; GCN-HSA-NEXT: s_add_u32 s60, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s61, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s61 +; GCN-HSA-NEXT: s_add_u32 s60, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s61, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s60 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v20, s61 +; GCN-HSA-NEXT: s_add_u32 s60, s0, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s61, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s61 +; GCN-HSA-NEXT: s_add_u32 s60, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s61, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s61 +; GCN-HSA-NEXT: s_add_u32 s60, s0, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s61, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s61 +; GCN-HSA-NEXT: s_add_u32 s60, s0, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s61, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s61 +; GCN-HSA-NEXT: s_add_u32 s60, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s29 +; GCN-HSA-NEXT: s_addc_u32 s61, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[6:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s49 +; GCN-HSA-NEXT: s_add_u32 s22, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s24 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s42 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v25, s43 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[14:17] +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s23 +; GCN-HSA-NEXT: s_addc_u32 s23, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55 -; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18 -; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: flat_store_dwordx4 v[31:32], v[22:25] +; GCN-HSA-NEXT: flat_store_dwordx4 v[33:34], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[26:29] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GCN-HSA-NEXT: s_add_u32 s16, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8535,8 +8543,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8555,8 +8563,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -10808,41 +10816,41 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 
v14, 24, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff00, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0 -; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-HSA-NEXT: v_or_b32_e32 v0, v17, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; GCN-HSA-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GCN-HSA-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00ff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00ff, v0 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v0, 8, v15 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GCN-HSA-NEXT: v_alignbit_b32 v2, v14, v2, 16 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00ff, v4 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GCN-HSA-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 +; GCN-HSA-NEXT: v_or_b32_e32 v1, v1, v13 +; GCN-HSA-NEXT: v_or_b32_e32 v7, v16, v7 +; GCN-HSA-NEXT: v_or_b32_e32 v5, v17, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[1:4] +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[5:8] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16: @@ -11767,72 +11775,71 @@ define amdgpu_kernel void 
@global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff00, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 24, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff00, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff00, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 24, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 24, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v4 +; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v18, v5, 16 +; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v16, v16, v4, 16 +; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v21, v14, v7, 16 +; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v22, v10, v6, 16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xff, v6 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v17 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v15 
+; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v19, v4 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v20, v10 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v13 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v7, v10 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v23, v8 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v7, 8, v12 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v14, v14, v7 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v12, v18, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v5 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v18, v11, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 24, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v22 +; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v13, v1, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 24, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff00ff, v1 +; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v13, v0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 24, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff00ff, v0 +; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff00ff, v0 +; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00ff, v0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: 
v_and_b32_e32 v3, 0xff00, v10 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff00, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7 -; GCN-NOHSA-SI-NEXT: 
v_and_b32_e32 v1, 0xff00ff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v16, v1, v0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i16: @@ -11844,88 +11851,88 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[7:10], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 
0xff00, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v2 +; GCN-HSA-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; GCN-HSA-NEXT: v_alignbit_b32 v2, v12, v2, 16 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00ff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00ff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v0 +; GCN-HSA-NEXT: v_or_b32_e32 v13, v17, v13 +; GCN-HSA-NEXT: v_or_b32_e32 v11, v18, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xff00, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xff00, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 24, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_alignbit_b32 v11, v15, v0, 16 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff00, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 24, v7 +; GCN-HSA-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v7 +; GCN-HSA-NEXT: v_alignbit_b32 v8, v24, v8, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; GCN-HSA-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v21 ; 
GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xff, v10 +; GCN-HSA-NEXT: v_alignbit_b32 v22, v22, v7, 16 +; GCN-HSA-NEXT: v_alignbit_b32 v24, v17, v10, 16 +; GCN-HSA-NEXT: v_alignbit_b32 v20, v20, v9, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xff, v9 +; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-HSA-NEXT: v_or_b32_e32 v6, v3, v4 +; GCN-HSA-NEXT: v_or_b32_e32 v4, v16, v5 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v8 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v18 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v19 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[6:9] -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4 -; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5 -; GCN-HSA-NEXT: 
v_and_b32_e32 v5, 0xff00ff, v9 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 -; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13 -; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v11 +; GCN-HSA-NEXT: v_or_b32_e32 v10, v25, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v24 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v20 +; GCN-HSA-NEXT: v_or_b32_e32 v8, v26, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v22 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 
v[14:15], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16: @@ -11943,10 +11950,27 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v4 +; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v8, v8, v2, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xff00ff, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff0000, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v10, v10, v4, 16 +; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v12, v9, v0, 16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 24, v6 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff0000, v4 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v5, v10, v6, 16 +; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24 @@ -11954,9 +11978,8 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010 ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8 -; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xff0000, v6 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 ; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010 ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 @@ -11971,14 +11994,9 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v12 +; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff0000, v7 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16 @@ -11987,35 +12005,24 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11 ; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17 ; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13 ; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7 ; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6 ; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14 ; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 
offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i8_to_v32i16: @@ -12704,17 +12711,15 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s21, v5 ; GCN-HSA-NEXT: s_ashr_i32 s6, s5, 24 ; GCN-HSA-NEXT: s_bfe_i32 s7, s5, 0x80010 ; GCN-HSA-NEXT: s_bfe_i32 s8, s5, 0x80008 @@ -12729,11 +12734,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_sext_i32_i8 s3, s3 ; GCN-HSA-NEXT: s_ashr_i32 s15, s2, 24 ; GCN-HSA-NEXT: s_bfe_i32 s16, s2, 0x80010 -; GCN-HSA-NEXT: s_ashr_i32 s22, s21, 24 -; GCN-HSA-NEXT: s_bfe_i32 s23, s21, 0x80010 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_readfirstlane_b32 s18, v6 ; GCN-HSA-NEXT: v_readfirstlane_b32 s19, v7 ; GCN-HSA-NEXT: v_readfirstlane_b32 s20, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s21, v5 ; GCN-HSA-NEXT: s_lshl_b32 s6, s6, 16 ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 16 @@ -12748,55 +12753,57 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 16 ; 
GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s22, s22, 16 -; GCN-HSA-NEXT: s_and_b32 s23, s23, 0xffff ; GCN-HSA-NEXT: s_bfe_i32 s17, s2, 0x80008 +; GCN-HSA-NEXT: s_ashr_i32 s22, s21, 24 +; GCN-HSA-NEXT: s_bfe_i32 s23, s21, 0x80010 ; GCN-HSA-NEXT: s_bfe_i32 s24, s21, 0x80008 ; GCN-HSA-NEXT: s_sext_i32_i8 s21, s21 ; GCN-HSA-NEXT: s_ashr_i32 s25, s20, 24 +; GCN-HSA-NEXT: s_bfe_i32 s26, s20, 0x80010 ; GCN-HSA-NEXT: s_or_b32 s6, s7, s6 -; GCN-HSA-NEXT: s_bfe_i32 s7, s20, 0x80010 -; GCN-HSA-NEXT: s_or_b32 s5, s5, s8 -; GCN-HSA-NEXT: s_bfe_i32 s8, s20, 0x80008 +; GCN-HSA-NEXT: s_bfe_i32 s7, s20, 0x80008 ; GCN-HSA-NEXT: s_sext_i32_i8 s20, s20 +; GCN-HSA-NEXT: s_or_b32 s5, s5, s8 +; GCN-HSA-NEXT: s_ashr_i32 s8, s19, 24 ; GCN-HSA-NEXT: s_or_b32 s9, s10, s9 -; GCN-HSA-NEXT: s_ashr_i32 s10, s19, 24 +; GCN-HSA-NEXT: s_bfe_i32 s10, s19, 0x80010 ; GCN-HSA-NEXT: s_or_b32 s4, s4, s11 -; GCN-HSA-NEXT: s_bfe_i32 s11, s19, 0x80010 -; GCN-HSA-NEXT: s_or_b32 s12, s13, s12 -; GCN-HSA-NEXT: s_bfe_i32 s13, s19, 0x80008 +; GCN-HSA-NEXT: s_bfe_i32 s11, s19, 0x80008 ; GCN-HSA-NEXT: s_sext_i32_i8 s19, s19 +; GCN-HSA-NEXT: s_or_b32 s12, s13, s12 +; GCN-HSA-NEXT: s_ashr_i32 s13, s18, 24 ; GCN-HSA-NEXT: s_or_b32 s3, s3, s14 -; GCN-HSA-NEXT: s_ashr_i32 s14, s18, 24 +; GCN-HSA-NEXT: s_bfe_i32 s14, s18, 0x80010 ; GCN-HSA-NEXT: s_or_b32 s15, s16, s15 -; GCN-HSA-NEXT: s_bfe_i32 s16, s18, 0x80010 -; GCN-HSA-NEXT: s_or_b32 s22, s23, s22 -; GCN-HSA-NEXT: s_bfe_i32 s23, s18, 0x80008 +; GCN-HSA-NEXT: s_bfe_i32 s16, s18, 0x80008 ; GCN-HSA-NEXT: s_sext_i32_i8 s18, s18 ; GCN-HSA-NEXT: s_sext_i32_i8 s2, s2 ; GCN-HSA-NEXT: s_lshl_b32 s17, s17, 16 +; GCN-HSA-NEXT: s_lshl_b32 s22, s22, 16 +; GCN-HSA-NEXT: s_and_b32 s23, s23, 0xffff ; GCN-HSA-NEXT: s_lshl_b32 s24, s24, 16 ; GCN-HSA-NEXT: s_and_b32 s21, s21, 0xffff ; GCN-HSA-NEXT: s_lshl_b32 s25, s25, 16 -; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 16 +; GCN-HSA-NEXT: s_and_b32 s26, s26, 0xffff +; 
GCN-HSA-NEXT: s_lshl_b32 s7, s7, 16 ; GCN-HSA-NEXT: s_and_b32 s20, s20, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 16 +; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 16 +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_lshl_b32 s11, s11, 16 ; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s23, s23, 16 +; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 16 +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_lshl_b32 s16, s16, 16 ; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-HSA-NEXT: s_or_b32 s22, s23, s22 ; GCN-HSA-NEXT: s_or_b32 s21, s21, s24 -; GCN-HSA-NEXT: s_or_b32 s7, s7, s25 -; GCN-HSA-NEXT: s_or_b32 s8, s20, s8 -; GCN-HSA-NEXT: s_or_b32 s10, s11, s10 -; GCN-HSA-NEXT: s_or_b32 s11, s19, s13 -; GCN-HSA-NEXT: s_or_b32 s13, s16, s14 -; GCN-HSA-NEXT: s_or_b32 s14, s18, s23 +; GCN-HSA-NEXT: s_or_b32 s23, s26, s25 +; GCN-HSA-NEXT: s_or_b32 s7, s20, s7 +; GCN-HSA-NEXT: s_or_b32 s8, s10, s8 +; GCN-HSA-NEXT: s_or_b32 s10, s19, s11 +; GCN-HSA-NEXT: s_or_b32 s11, s14, s13 +; GCN-HSA-NEXT: s_or_b32 s13, s18, s16 ; GCN-HSA-NEXT: s_or_b32 s2, s2, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 @@ -12818,16 +12825,16 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index ddd1ce66c013a..6917d1ef10a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -1525,6 +1525,7 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v20, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 @@ -1536,18 +1537,17 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; SI-NEXT: ds_write2_b64 v20, v[2:3], v[0:1] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v20, v[18:19], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v20, v[14:15], 
v[12:13] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v20, v[10:11], v[8:9] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32: @@ -1558,29 +1558,29 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v20, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v3 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; 
VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v4 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[2:3], v[0:1] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[18:19], v[16:17] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[10:11], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32: @@ -1590,29 +1590,29 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v20, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX9-NO-DS128-NEXT: 
v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v3 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v4 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[2:3], v[0:1] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[18:19], v[16:17] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[10:11], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v16i16_to_v16i32: @@ -1723,29 +1723,29 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v20, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 
v15, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48 -; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32 -; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 -; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] +; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; VI-DS128-NEXT: ds_write_b128 v20, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v20, v[16:19] offset:32 +; VI-DS128-NEXT: ds_write_b128 v20, v[12:15] offset:16 +; VI-DS128-NEXT: ds_write_b128 v20, v[8:11] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i32: @@ -1755,29 +1755,29 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v20, s0 
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] +; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GFX9-DS128-NEXT: ds_write_b128 v20, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v20, v[16:19] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v20, v[12:15] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v20, v[8:11] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i32> @@ -1794,6 +1794,7 @@ define amdgpu_kernel void 
@local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v20, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v0 @@ -1805,18 +1806,17 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v14, v2, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v7 ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v6 +; SI-NEXT: v_bfe_i32 v14, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v16, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v18, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v0, v7, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v6, 0, 16 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; SI-NEXT: ds_write2_b64 v20, v[2:3], v[0:1] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v20, v[18:19], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v20, v[10:11], v[8:9] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32: @@ -1827,29 +1827,29 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v20, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3 ; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v15, 16, v2 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16 -; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[2:3], v[0:1] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[18:19], v[16:17] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[10:11], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32: @@ -1859,29 +1859,29 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v20, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, 
v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[2:3], v[0:1] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[18:19], v[16:17] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[10:11], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v16i16_to_v16i32: @@ -2002,29 +2002,29 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v20, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: 
v_ashrrev_i32_e32 v11, 16, v1 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 -; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 -; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] +; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v20, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v20, v[16:19] offset:32 +; VI-DS128-NEXT: ds_write_b128 v20, v[12:15] offset:16 +; VI-DS128-NEXT: ds_write_b128 v20, v[8:11] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i32: @@ -2034,29 +2034,29 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v20, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 
v9, 16, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] +; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v20, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v20, v[16:19] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v20, v[12:15] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v20, v[8:11] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i32> @@ -2080,46 +2080,46 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v18, 
0xffff, v0 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v2 -; SI-NEXT: s_waitcnt lgkmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v32, s0 -; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 -; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 -; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 
v32, v[18:19], v[16:17] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v15, s0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; SI-NEXT: ds_write2_b64 v15, v[12:13], v[10:11] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v15, v[8:9], v[6:7] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v15, v[4:5], v[2:3] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v15, v[0:1], v[32:33] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v15, v[30:31], v[28:29] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v15, v[26:27], v[24:25] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v15, v[22:23], v[20:21] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32: @@ -2127,112 +2127,112 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; 
VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; VI-NO-DS128-NEXT: 
v_and_b32_e32 v22, 0xffff, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; VI-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v9 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v11 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; VI-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; VI-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; VI-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, 
v12 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[10:11] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[8:9], v[6:7] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[4:5], v[2:3] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[32:33] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[30:31], v[28:29] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[26:27], v[24:25] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[22:23], v[20:21] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[16:17] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX9-NO-DS128-NEXT: 
v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v9 +; 
GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v11 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v7 +; 
GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[10:11] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[8:9], v[6:7] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[4:5], v[2:3] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[32:33] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[30:31], v[28:29] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[26:27], v[24:25] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[22:23], v[20:21] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[16:17] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v32i16_to_v32i32: @@ -2437,108 +2437,110 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v20, s1 -; VI-DS128-NEXT: ds_read_b128 v[0:3], v20 -; VI-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48 -; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v12 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v12 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[8:11], v12 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[12:15], v12 offset:48 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; 
VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v0 +; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v10 +; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; VI-DS128-NEXT: v_and_b32_e32 v16, 
0xffff, v16 -; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 -; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112 -; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64 -; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80 -; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48 -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] -; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; VI-DS128-NEXT: v_mov_b32_e32 v13, s0 +; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v4 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; VI-DS128-NEXT: ds_write_b128 v13, v[8:11] offset:96 +; VI-DS128-NEXT: ds_write_b128 v13, v[4:7] offset:112 +; VI-DS128-NEXT: ds_write_b128 v13, v[0:3] offset:64 +; VI-DS128-NEXT: ds_write_b128 v13, v[32:35] offset:80 +; VI-DS128-NEXT: ds_write_b128 v13, v[28:31] offset:32 +; VI-DS128-NEXT: ds_write_b128 v13, v[24:27] offset:48 +; VI-DS128-NEXT: ds_write_b128 v13, v[20:23] +; VI-DS128-NEXT: ds_write_b128 v13, v[16:19] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v20, s1 -; GFX9-DS128-NEXT: 
ds_read_b128 v[0:3], v20 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16 -; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48 -; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v12 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v12 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v12 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[12:15], v12 offset:48 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v0 +; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v10 +; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX9-DS128-NEXT: 
v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] -; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, s0 +; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GFX9-DS128-NEXT: 
v_and_b32_e32 v30, 0xffff, v5 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[8:11] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[4:7] offset:112 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[0:3] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[32:35] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[28:31] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[24:27] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[20:23] +; GFX9-DS128-NEXT: ds_write_b128 v13, v[16:19] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i32> @@ -2562,46 +2564,46 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3 ; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v4 +; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v7 +; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v9 ; SI-NEXT: v_bfe_i32 v16, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v18, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v20, v3, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; SI-NEXT: v_bfe_i32 v22, v2, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(2) -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v5 -; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v4 -; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v7 -; SI-NEXT: v_bfe_i32 v4, v7, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v6 -; SI-NEXT: v_bfe_i32 v6, v6, 0, 16 -; SI-NEXT: s_waitcnt 
lgkmcnt(1) -; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v9 -; SI-NEXT: v_bfe_i32 v24, v9, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 -; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v11 -; SI-NEXT: v_bfe_i32 v26, v11, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 -; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 +; SI-NEXT: v_bfe_i32 v24, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v26, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v28, v7, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v13 -; SI-NEXT: v_bfe_i32 v28, v13, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 -; SI-NEXT: v_bfe_i32 v30, v15, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 -; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 -; SI-NEXT: v_mov_b32_e32 v32, s0 -; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 -; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 -; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v13 +; SI-NEXT: v_bfe_i32 v30, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v32, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v8, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v11, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v12 +; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 +; SI-NEXT: v_bfe_i32 v4, v10, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v13, 0, 16 +; SI-NEXT: v_bfe_i32 v8, v12, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v15, 0, 16 +; SI-NEXT: v_mov_b32_e32 v15, s0 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v14 +; SI-NEXT: v_bfe_i32 v12, v14, 0, 16 +; SI-NEXT: ds_write2_b64 v15, v[12:13], 
v[10:11] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v15, v[8:9], v[6:7] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v15, v[4:5], v[2:3] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v15, v[0:1], v[32:33] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v15, v[30:31], v[28:29] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v15, v[26:27], v[24:25] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v15, v[22:23], v[20:21] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32: @@ -2609,112 +2611,112 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4 -; VI-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 
offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v5 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v4 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1 -; VI-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v9 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v20, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v22, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v28, v5, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v10 +; VI-NO-DS128-NEXT: v_bfe_i32 v30, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v32, v9, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v8, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 -; VI-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; 
VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v13 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v10, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v14, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v12 +; VI-NO-DS128-NEXT: v_bfe_i32 v12, v12, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v7 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v24, v7, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v26, v6, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v15, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[10:11] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[8:9], v[6:7] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[4:5], v[2:3] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[32:33] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[30:31], v[28:29] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[26:27], v[24:25] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, 
v[22:23], v[20:21] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[16:17] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v3 +; GFX9-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v19, 16, v2 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v5 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v4 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v9 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v10 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v30, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v32, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: 
ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v14 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v13 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v12 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v15, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[10:11] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[8:9], v[6:7] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[4:5], v[2:3] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[32:33] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[30:31], v[28:29] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[26:27], v[24:25] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[22:23], v[20:21] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[16:17] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v32i16_to_v32i32: @@ -2938,112 +2940,110 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out ; VI-DS128-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v24, s1 -; VI-DS128-NEXT: ds_read_b128 v[0:3], v24 -; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0 -; VI-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 -; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48 +; VI-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v12 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v12 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[8:11], v12 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[12:15], v12 offset:48 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v3 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v35, 16, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v2 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v0 +; VI-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; 
VI-DS128-NEXT: v_bfe_i32 v22, v1, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; VI-DS128-NEXT: v_bfe_i32 v20, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v10 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; VI-DS128-NEXT: v_bfe_i32 v34, v11, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v32, v10, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4 -; VI-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 -; VI-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6 -; VI-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96 -; VI-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112 -; VI-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64 -; VI-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80 -; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 -; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] -; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v13 +; VI-DS128-NEXT: v_bfe_i32 v0, v8, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v12 +; VI-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v13, s0 +; VI-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v6 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v26, v7, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v24, v6, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 +; 
VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v15 +; VI-DS128-NEXT: v_bfe_i32 v28, v4, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v14 +; VI-DS128-NEXT: v_bfe_i32 v6, v15, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v14, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v13, v[8:11] offset:96 +; VI-DS128-NEXT: ds_write_b128 v13, v[4:7] offset:112 +; VI-DS128-NEXT: ds_write_b128 v13, v[0:3] offset:64 +; VI-DS128-NEXT: ds_write_b128 v13, v[32:35] offset:80 +; VI-DS128-NEXT: ds_write_b128 v13, v[28:31] offset:32 +; VI-DS128-NEXT: ds_write_b128 v13, v[24:27] offset:48 +; VI-DS128-NEXT: ds_write_b128 v13, v[20:23] +; VI-DS128-NEXT: ds_write_b128 v13, v[16:19] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v24 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s1 +; 
GFX9-DS128-NEXT: ds_read_b128 v[0:3], v12 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v12 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v12 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[12:15], v12 offset:48 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v3 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v1 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v35, 16, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v1, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v0, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v10 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; GFX9-DS128-NEXT: v_bfe_i32 v34, v11, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v32, v10, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4 -; GFX9-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6 -; GFX9-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112 -; GFX9-DS128-NEXT: 
ds_write_b128 v4, v[27:30] offset:64 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] -; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v13 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v12 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, s0 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v7 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v6 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v26, v7, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v24, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v15 +; GFX9-DS128-NEXT: v_bfe_i32 v28, v4, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v14 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v15, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v14, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[8:11] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[4:7] offset:112 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[0:3] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[32:35] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[28:31] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[24:27] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v13, v[20:23] +; GFX9-DS128-NEXT: ds_write_b128 v13, v[16:19] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i32> @@ -3066,12 +3066,12 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v24 offset0:8 offset1:9 ; SI-NEXT: ds_read2_b64 v[4:7], v24 offset0:10 offset1:11 -; SI-NEXT: ds_read2_b64 
v[12:15], v24 offset0:12 offset1:13 -; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:14 offset1:15 +; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:12 offset1:13 +; SI-NEXT: ds_read2_b64 v[12:15], v24 offset0:14 offset1:15 ; SI-NEXT: ds_read2_b64 v[20:23], v24 offset1:1 ; SI-NEXT: ds_read2_b64 v[16:19], v24 offset0:2 offset1:3 -; SI-NEXT: ds_read2_b64 v[34:37], v24 offset0:4 offset1:5 -; SI-NEXT: ds_read2_b64 v[38:41], v24 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[36:39], v24 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[40:43], v24 offset0:6 offset1:7 ; SI-NEXT: s_waitcnt lgkmcnt(7) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 @@ -3079,38 +3079,38 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 ; SI-NEXT: s_waitcnt lgkmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v1 ; SI-NEXT: buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_and_b32_e32 v32, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 ; SI-NEXT: s_waitcnt lgkmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 
v4, 0xffff, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_and_b32_e32 v44, 0xffff, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_and_b32_e32 v42, 0xffff, v15 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_and_b32_e32 v46, 0xffff, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_and_b32_e32 v44, 0xffff, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; SI-NEXT: v_and_b32_e32 v46, 0xffff, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt lgkmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 ; SI-NEXT: v_and_b32_e32 v48, 0xffff, v21 @@ -3130,38 +3130,38 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v35 -; SI-NEXT: v_and_b32_e32 v56, 0xffff, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v37 -; SI-NEXT: v_and_b32_e32 v58, 0xffff, v37 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37 +; SI-NEXT: v_and_b32_e32 v56, 0xffff, v37 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36 ; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v39 -; SI-NEXT: v_and_b32_e32 v60, 0xffff, v39 +; SI-NEXT: 
v_lshrrev_b32_e32 v59, 16, v39 +; SI-NEXT: v_and_b32_e32 v58, 0xffff, v39 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 ; SI-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 -; SI-NEXT: v_and_b32_e32 v62, 0xffff, v41 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_and_b32_e32 v60, 0xffff, v41 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40 ; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; SI-NEXT: v_and_b32_e32 v62, 0xffff, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v42 +; SI-NEXT: v_and_b32_e32 v42, 0xffff, v42 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15 -; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11 -; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v0, v[42:43], v[62:63] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v0, v[40:41], v[60:61] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v0, v[38:39], v[58:59] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v0, v[36:37], v[56:57] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v0, v[20:21], v[48:49] offset1:1 -; SI-NEXT: ds_write2_b64 v0, v[10:11], v[46:47] offset0:30 offset1:31 -; SI-NEXT: ds_write2_b64 v0, v[8:9], v[44:45] offset0:28 offset1:29 -; SI-NEXT: ds_write2_b64 v0, v[14:15], v[42:43] offset0:26 offset1:27 -; SI-NEXT: ds_write2_b64 v0, v[12:13], v[6:7] offset0:24 offset1:25 -; SI-NEXT: ds_write2_b64 v0, v[4:5], v[2:3] offset0:22 offset1:23 -; SI-NEXT: ds_write2_b64 v0, v[24:25], v[32:33] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 offset1:31 +; SI-NEXT: ds_write2_b64 v0, 
v[12:13], v[44:45] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:26 offset1:27 +; SI-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v0, v[2:3], v[24:25] offset0:22 offset1:23 +; SI-NEXT: ds_write2_b64 v0, v[34:35], v[32:33] offset0:20 offset1:21 ; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload @@ -3177,112 +3177,112 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v56, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v56 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v13 ; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v14 ; VI-NO-DS128-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 -; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19 -; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v16 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:6 offset1:7 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v22 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NO-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v21 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; VI-NO-DS128-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NO-DS128-NEXT: v_and_b32_e32 v31, 0xffff, v23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v21 +; 
VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v20 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; VI-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v22 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v18 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v35, 0xffff, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_and_b32_e32 v37, 0xffff, v20 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; VI-NO-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v19 -; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NO-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v18 +; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:10 offset1:11 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v22 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v21 -; VI-NO-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v21 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; VI-NO-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v24 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NO-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v21 +; VI-NO-DS128-NEXT: 
v_and_b32_e32 v40, 0xffff, v21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; VI-NO-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v20 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v23 +; VI-NO-DS128-NEXT: v_and_b32_e32 v44, 0xffff, v23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v22 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v18 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v17 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19 -; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NO-DS128-NEXT: v_and_b32_e32 v50, 0xffff, v16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v52, 0xffff, v19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v18 +; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; VI-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; 
VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; VI-NO-DS128-NEXT: v_and_b32_e32 v59, 0xffff, v24 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; VI-NO-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v62, 16, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v61, 0xffff, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; VI-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v23 +; VI-NO-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[0:1], v[19:20] offset0:30 offset1:31 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[16:17], v[61:62] offset0:28 offset1:29 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[23:24], v[59:60] offset0:26 offset1:27 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[21:22], v[57:58] offset0:24 offset1:25 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[55:56], v[53:54] offset0:22 offset1:23 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[51:52], v[49:50] offset0:20 offset1:21 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[47:48], v[45:46] offset0:18 offset1:19 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[43:44], v[41:42] offset0:16 offset1:17 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[39:40], v[37:38] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, 
v[35:36], v[33:34] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[31:32], v[29:30] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[27:28], v[25:26] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[14:15], v[12:13] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[10:11], v[8:9] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[6:7], v[4:5] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[62:63] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[60:61] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[58:59] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[56:57] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[38:39], v[36:37] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[34:35], v[32:33] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: buffer_load_dword v4, off, s[88:91], 0 ; 4-byte Folded Reload +; VI-NO-DS128-NEXT: buffer_load_dword v5, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-NO-DS128-NEXT: s_waitcnt 
vmcnt(0) -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[2:3], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: @@ -3294,104 +3294,104 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[12:15], v56 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v13 ; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: s_nop 0 ; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v14 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; 
GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v14 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v18 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v20 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v21 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v20 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v22 
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v22 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v18 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v44, 0xffff, v19 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v18 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v18 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v21 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v21 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v20 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v23 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v44, 0xffff, v23 +; 
GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v22 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v22 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v21 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v50, 0xffff, v20 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v53, 16, v23 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v52, 0xffff, v23 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v50, 0xffff, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v52, 0xffff, v19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v18 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v21 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; GFX9-NO-DS128-NEXT: 
v_and_b32_e32 v62, 0xffff, v19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v23 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v23 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v17 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v19 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v21 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v21 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[62:63] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[60:61] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[58:59] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[56:57] offset0:24 offset1:25 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, 
v[42:43], v[40:41] offset0:16 offset1:17 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[24:25], v[38:39] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[36:37], v[34:35] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[32:33], v[30:31] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[28:29], v[26:27] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[38:39], v[36:37] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[34:35], v[32:33] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3 @@ -3802,17 +3802,22 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_mov_b32 s90, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b128 v[8:11], v0 +; VI-DS128-NEXT: ds_read_b128 v[12:15], v0 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; VI-DS128-NEXT: v_mov_b32_e32 v35, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v31, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 ; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; VI-DS128-NEXT: buffer_store_dword v1, off, 
s[88:91], 0 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill @@ -3835,6 +3840,12 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill +; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v15 +; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v14 +; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v12 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 ; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 @@ -3842,18 +3853,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 ; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 -; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v25 +; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, 
v10 -; VI-DS128-NEXT: v_mov_b32_e32 v31, v15 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -3864,25 +3876,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 -; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 +; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 -; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 -; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -3904,7 +3907,7 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_write_b128 v24, 
v[47:50] offset:176 ; VI-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128 ; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 -; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 +; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:96 ; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 ; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload @@ -3925,8 +3928,8 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] -; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 +; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] +; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v64i16_to_v64i32: @@ -3938,15 +3941,18 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0 +; GFX9-DS128-NEXT: ds_read_b128 v[12:15], v0 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v13 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GFX9-DS128-NEXT: v_mov_b32_e32 v35, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 ; 
GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 @@ -3975,6 +3981,12 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v15 +; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v14 +; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v12 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 ; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 @@ -3982,18 +3994,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 -; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v25 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; 
GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -4004,24 +4017,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 -; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 -; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 +; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 -; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 -; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -4043,7 +4048,7 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[43:46] 
offset:128 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload @@ -4064,8 +4069,8 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) ; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] -; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] +; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = zext <64 x i16> %load to <64 x i32> @@ -4091,9 +4096,9 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; SI-NEXT: ds_read2_b64 v[8:11], v20 offset0:12 offset1:13 ; SI-NEXT: ds_read2_b64 v[12:15], v20 offset0:14 offset1:15 ; SI-NEXT: ds_read2_b64 v[16:19], v20 offset1:1 -; SI-NEXT: ds_read2_b64 v[30:33], v20 offset0:2 offset1:3 -; SI-NEXT: ds_read2_b64 v[34:37], v20 offset0:4 offset1:5 -; SI-NEXT: ds_read2_b64 v[38:41], v20 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[32:35], v20 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[36:39], v20 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[40:43], v20 offset0:6 offset1:7 ; SI-NEXT: s_waitcnt lgkmcnt(7) ; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v5 ; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v4 @@ -4101,27 +4106,27 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v6 ; SI-NEXT: s_waitcnt lgkmcnt(6) ; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v31, 
16, v0 ; SI-NEXT: v_bfe_i32 v20, v5, 0, 16 ; SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; SI-NEXT: v_bfe_i32 v22, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v24, v7, 0, 16 ; SI-NEXT: v_bfe_i32 v26, v6, 0, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v0 -; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v3 ; SI-NEXT: v_bfe_i32 v28, v1, 0, 16 -; SI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v6, v3, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v2 -; SI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v30, v0, 0, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3 +; SI-NEXT: v_bfe_i32 v20, v3, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v2 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(5) -; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 -; SI-NEXT: v_bfe_i32 v2, v9, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 -; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v11 -; SI-NEXT: v_bfe_i32 v42, v11, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v9 +; SI-NEXT: v_bfe_i32 v4, v9, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v8 +; SI-NEXT: v_bfe_i32 v6, v8, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v11 +; SI-NEXT: v_bfe_i32 v8, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 ; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(4) @@ -4143,47 +4148,47 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v18 ; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(2) -; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v31 -; SI-NEXT: v_bfe_i32 v52, v31, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v30 -; SI-NEXT: v_bfe_i32 v30, v30, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33 -; SI-NEXT: v_bfe_i32 v54, v33, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v33 +; SI-NEXT: v_bfe_i32 v52, v33, 0, 16 ; SI-NEXT: 
v_ashrrev_i32_e32 v33, 16, v32 ; SI-NEXT: v_bfe_i32 v32, v32, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v35 -; SI-NEXT: v_bfe_i32 v56, v35, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v35 +; SI-NEXT: v_bfe_i32 v54, v35, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v35, 16, v34 ; SI-NEXT: v_bfe_i32 v34, v34, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v59, 16, v37 -; SI-NEXT: v_bfe_i32 v58, v37, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v37 +; SI-NEXT: v_bfe_i32 v56, v37, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v37, 16, v36 ; SI-NEXT: v_bfe_i32 v36, v36, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v61, 16, v39 -; SI-NEXT: v_bfe_i32 v60, v39, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v59, 16, v39 +; SI-NEXT: v_bfe_i32 v58, v39, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v39, 16, v38 ; SI-NEXT: v_bfe_i32 v38, v38, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v63, 16, v41 -; SI-NEXT: v_bfe_i32 v62, v41, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v61, 16, v41 +; SI-NEXT: v_bfe_i32 v60, v41, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v41, 16, v40 ; SI-NEXT: v_bfe_i32 v40, v40, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v63, 16, v43 +; SI-NEXT: v_bfe_i32 v62, v43, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v42 +; SI-NEXT: v_bfe_i32 v42, v42, 0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15 -; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11 -; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v0, v[32:33], v[54:55] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v0, v[30:31], v[52:53] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v0, v[42:43], v[62:63] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v0, v[40:41], v[60:61] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v0, v[38:39], v[58:59] offset0:10 offset1:11 +; 
SI-NEXT: ds_write2_b64 v0, v[36:37], v[56:57] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v0, v[34:35], v[54:55] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v0, v[32:33], v[52:53] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v0, v[18:19], v[50:51] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v0, v[16:17], v[48:49] offset1:1 ; SI-NEXT: ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 offset1:31 ; SI-NEXT: ds_write2_b64 v0, v[12:13], v[44:45] offset0:28 offset1:29 -; SI-NEXT: ds_write2_b64 v0, v[10:11], v[42:43] offset0:26 offset1:27 -; SI-NEXT: ds_write2_b64 v0, v[8:9], v[2:3] offset0:24 offset1:25 -; SI-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset0:22 offset1:23 -; SI-NEXT: ds_write2_b64 v0, v[20:21], v[28:29] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:26 offset1:27 +; SI-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v0, v[2:3], v[20:21] offset0:22 offset1:23 +; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:20 offset1:21 ; SI-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:18 offset1:19 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload @@ -4200,22 +4205,22 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v28 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v28 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; 
VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v13 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16 ; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v14 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 @@ -4246,23 +4251,23 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17 -; VI-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16 -; 
VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v12, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v15, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v18 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v14, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v17, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v16, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v12, v19, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20 @@ -4313,22 +4318,22 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[12:15], v28 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v28 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v13 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16 ; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: s_nop 0 ; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v14 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 @@ -4359,23 +4364,23 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v17 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v12, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v15, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v19 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v18 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v17, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, 
v19, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20 @@ -4852,17 +4857,23 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_mov_b32 s90, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 -; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 +; VI-DS128-NEXT: ds_read_b128 v[12:15], v32 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v15 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v13 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v19 +; VI-DS128-NEXT: v_mov_b32_e32 v23, v11 +; VI-DS128-NEXT: v_mov_b32_e32 v11, v3 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; VI-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill @@ -4890,7 +4901,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 ; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v14 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; 
VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -4900,23 +4911,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 -; VI-DS128-NEXT: v_mov_b32_e32 v23, v15 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v12 +; VI-DS128-NEXT: v_bfe_i32 v22, v15, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v20, v14, 0, 16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 -; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 -; VI-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v25 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v24 +; VI-DS128-NEXT: v_bfe_i32 v14, v25, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v12, v24, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 @@ -4953,7 +4961,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 ; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 ; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 -; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 +; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:64 ; VI-DS128-NEXT: 
ds_write_b128 v32, v[16:19] offset:80 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload @@ -4967,7 +4975,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] +; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] ; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 ; VI-DS128-NEXT: s_endpgm ; @@ -4980,15 +4988,19 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 +; GFX9-DS128-NEXT: ds_read_b128 v[12:15], v32 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v15 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v13 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v19 +; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v11 +; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 @@ -5021,7 +5033,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 ; GFX9-DS128-NEXT: 
v_bfe_i32 v52, v40, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v14 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -5031,22 +5043,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v15 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v12 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v15, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v14, 0, 16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v25 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v24 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v25, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v24, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 @@ -5083,7 +5093,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 ; 
GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:64 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload @@ -5097,7 +5107,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) ; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] +; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] ; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in @@ -5910,6 +5920,7 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v15, v5 ; SI-NEXT: v_mov_b32_e32 v17, v5 ; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v20, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 @@ -5919,73 +5930,72 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ds_write2_b64 v0, v[8:9], v[6:7] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v0, v[12:13], v[4:5] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v0, v[10:11], v[16:17] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v0, v[14:15], v[18:19] offset1:1 +; SI-NEXT: ds_write2_b64 v20, v[8:9], v[6:7] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v20, v[12:13], v[4:5] offset0:2 offset1:3 +; SI-NEXT: 
ds_write2_b64 v20, v[10:11], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v20, v[14:15], v[18:19] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, 0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v4 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v4 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v3 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, 0 -; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v3 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[9:10], v[2:3] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v3 -; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[10:11], v[3:4] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[8:9] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v3 -; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[7:8], v[9:10] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v4 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v4 +; 
VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[1:2] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, v3 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[6:7], v[1:2] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v4 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v4 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[5:6], v[0:1] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v12 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NO-DS128-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v7, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v6, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v12 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[11:12], v[7:8] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v1 -; GFX9-NO-DS128-NEXT: 
ds_write2_b64 v13, v[2:3], v[6:7] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v7, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v8, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v2, v[3:4], v[8:9] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v6, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v2, v[12:13], v[7:8] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v5, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v12 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[9:10], v[5:6] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[0:1], v[4:5] offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v2, v[10:11], v[6:7] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v2, v[0:1], v[5:6] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v8i16_to_v8i64: @@ -6069,62 +6079,62 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: v_mov_b32_e32 v14, 0 +; VI-DS128-NEXT: v_mov_b32_e32 
v16, v14 +; VI-DS128-NEXT: v_mov_b32_e32 v11, v14 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 -; VI-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-DS128-NEXT: v_mov_b32_e32 v17, s0 +; VI-DS128-NEXT: v_mov_b32_e32 v8, v14 +; VI-DS128-NEXT: v_mov_b32_e32 v5, v14 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; VI-DS128-NEXT: v_mov_b32_e32 v1, 0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-DS128-NEXT: ds_write_b128 v17, v[13:16] offset:48 +; VI-DS128-NEXT: v_mov_b32_e32 v13, v14 ; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; VI-DS128-NEXT: v_mov_b32_e32 v11, v1 -; VI-DS128-NEXT: v_mov_b32_e32 v13, v1 -; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32 -; VI-DS128-NEXT: v_mov_b32_e32 v8, v1 -; VI-DS128-NEXT: v_mov_b32_e32 v10, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; VI-DS128-NEXT: ds_write_b128 v17, v[10:13] offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v10, v14 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 -; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 -; VI-DS128-NEXT: v_mov_b32_e32 v5, v1 -; VI-DS128-NEXT: v_mov_b32_e32 v7, v1 -; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v14, v[4:7] +; VI-DS128-NEXT: ds_write_b128 v17, v[7:10] offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v7, v14 +; VI-DS128-NEXT: ds_write_b128 v17, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-DS128-NEXT: v_mov_b32_e32 v11, 0 -; 
GFX9-DS128-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v11 -; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-DS128-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v16, v14 +; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-DS128-NEXT: s_mov_b32 s1, 0xffff -; GFX9-DS128-NEXT: v_mov_b32_e32 v14, s0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v17, s0 +; GFX9-DS128-NEXT: s_mov_b32 s0, 0xffff +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v14 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX9-DS128-NEXT: v_and_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v3 +; GFX9-DS128-NEXT: v_and_b32_sdwa v15, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; GFX9-DS128-NEXT: ds_write_b128 v17, v[13:16] offset:48 +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-DS128-NEXT: v_and_b32_sdwa v12, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX9-DS128-NEXT: v_and_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:48 -; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-DS128-NEXT: v_and_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: ds_write_b128 v17, v[10:13] offset:32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v14 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX9-DS128-NEXT: v_and_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v11 -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v14, v[4:7] +; GFX9-DS128-NEXT: v_and_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: ds_write_b128 v17, v[7:10] offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-DS128-NEXT: ds_write_b128 v17, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(3) %in %ext = zext <8 x i16> %load to <8 x i64> @@ -6140,31 +6150,31 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 -; SI-NEXT: v_mov_b32_e32 v16, s0 +; SI-NEXT: v_mov_b32_e32 v18, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v16, v6, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; 
SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; SI-NEXT: ds_write2_b64 v16, v[10:11], v[6:7] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v16, v[8:9], v[4:5] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v16, v[2:3], v[14:15] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v16, v[0:1], v[12:13] offset1:1 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; SI-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; SI-NEXT: ds_write2_b64 v18, v[12:13], v[6:7] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v18, v[8:9], v[4:5] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v18, v[10:11], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v18, v[0:1], v[14:15] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64: @@ -6408,46 +6418,46 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_mov_b32_e32 v4, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 -; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v12, v8 +; SI-NEXT: v_mov_b32_e32 v14, v8 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v1 +; SI-NEXT: v_mov_b32_e32 v21, s0 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 -; SI-NEXT: v_mov_b32_e32 v11, v9 -; SI-NEXT: v_mov_b32_e32 v13, v9 -; SI-NEXT: v_mov_b32_e32 v15, v9 -; SI-NEXT: v_mov_b32_e32 v17, v9 -; SI-NEXT: v_mov_b32_e32 v20, s0 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 -; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:10 offset1:11 -; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: ds_write2_b64 v21, v[15:16], v[13:14] offset0:10 offset1:11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; 
SI-NEXT: v_and_b32_e32 v16, 0xffff, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v3 -; SI-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:14 offset1:15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7 -; SI-NEXT: ds_write2_b64 v20, v[15:16], v[10:11] offset0:6 offset1:7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v6 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v5, v9 -; SI-NEXT: ds_write2_b64 v20, v[4:5], v[8:9] offset0:2 offset1:3 -; SI-NEXT: v_mov_b32_e32 v19, v9 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: v_mov_b32_e32 v15, v9 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: v_mov_b32_e32 v4, v9 -; SI-NEXT: ds_write2_b64 v20, v[18:19], v[12:13] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v20, v[10:11], v[1:2] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v20, v[7:8], v[3:4] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: ds_write2_b64 v21, v[16:17], v[11:12] offset0:14 offset1:15 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; SI-NEXT: ds_write2_b64 v21, v[17:18], v[9:10] offset0:6 offset1:7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; 
SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: ds_write2_b64 v21, v[5:6], v[7:8] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v21, v[17:18], v[19:20] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v21, v[15:16], v[1:2] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v21, v[9:10], v[3:4] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v21, v[13:14], v[11:12] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64: @@ -6455,47 +6465,47 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, 0 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v17, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v8 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[9:10] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v17, v[11:12], v[13:14] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v7 ; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; 
VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; VI-NO-DS128-NEXT: ds_write2_b64 v17, v[11:12], v[13:14] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v4 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; VI-NO-DS128-NEXT: ds_write2_b64 v17, v[5:6], v[15:16] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[7:8] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[3:4], v[9:10] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v17, v[13:14], v[7:8] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v17, v[3:4], v[11:12] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[2:3], v[6:7] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v17, v[2:3], v[5:6] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v8 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[1:2], v[5:6] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: 
ds_write2_b64 v17, v[1:2], v[10:11] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[11:12] offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v17, v[0:1], v[9:10] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64: @@ -6503,48 +6513,46 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v8 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v17, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v1 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[11:12], v[9:10] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v17, v[13:14], v[9:10] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v7 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:12 offset1:13 -; 
GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v4 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v17, v[9:10], v[13:14] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[7:8] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v17, v[5:6], v[15:16] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[9:10] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v17, v[13:14], v[7:8] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v17, v[3:4], v[9:10] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[2:3], v[6:7] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v17, v[2:3], v[5:6] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[1:2], v[5:6] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 
v17, v[1:2], v[12:13] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v8 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[0:1], v[13:14] offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v17, v[0:1], v[11:12] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v16i16_to_v16i64: @@ -6701,105 +6709,105 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 -; VI-DS128-NEXT: v_mov_b32_e32 v26, 0 -; VI-DS128-NEXT: v_mov_b32_e32 v22, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v24, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v27, 0 +; VI-DS128-NEXT: v_mov_b32_e32 v23, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v25, v27 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v5, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v5 -; VI-DS128-NEXT: ds_read_b128 v[13:16], v5 offset:16 -; VI-DS128-NEXT: v_mov_b32_e32 v11, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v19, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v8, v26 +; VI-DS128-NEXT: ds_read_b128 v[17:20], v5 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v29, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v15, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v11, v27 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v13 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; VI-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v14 -; VI-DS128-NEXT: v_mov_b32_e32 v14, s0 -; VI-DS128-NEXT: v_mov_b32_e32 v13, v26 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v18 +; VI-DS128-NEXT: v_mov_b32_e32 
v18, s0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v16 -; VI-DS128-NEXT: ds_write_b128 v14, v[21:24] offset:64 -; VI-DS128-NEXT: v_mov_b32_e32 v21, v26 -; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32 -; VI-DS128-NEXT: v_mov_b32_e32 v10, v26 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; VI-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; VI-DS128-NEXT: ds_write_b128 v14, v[18:21] offset:112 -; VI-DS128-NEXT: v_mov_b32_e32 v16, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v18, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v1, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v3, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v28, v26 -; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 -; VI-DS128-NEXT: v_mov_b32_e32 v5, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v7, v26 -; VI-DS128-NEXT: ds_write_b128 v14, v[15:18] offset:96 -; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v14, v[25:28] offset:80 -; VI-DS128-NEXT: ds_write_b128 v14, v[4:7] +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; VI-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; VI-DS128-NEXT: ds_write_b128 v18, v[22:25] offset:64 +; VI-DS128-NEXT: v_mov_b32_e32 v20, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v22, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v1, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v17, v27 +; VI-DS128-NEXT: 
ds_write_b128 v18, v[19:22] offset:112 +; VI-DS128-NEXT: ds_write_b128 v18, v[0:3] offset:96 +; VI-DS128-NEXT: ds_write_b128 v18, v[26:29] offset:80 +; VI-DS128-NEXT: v_mov_b32_e32 v13, v27 +; VI-DS128-NEXT: ds_write_b128 v18, v[14:17] offset:48 +; VI-DS128-NEXT: ds_write_b128 v18, v[10:13] offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v8, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v10, v27 +; VI-DS128-NEXT: ds_write_b128 v18, v[7:10] offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v5, v27 +; VI-DS128-NEXT: v_mov_b32_e32 v7, v27 +; VI-DS128-NEXT: ds_write_b128 v18, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-DS128-NEXT: v_mov_b32_e32 v25, 0 -; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v26, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v22, v26 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-DS128-NEXT: v_mov_b32_e32 v28, v26 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v28, s0 -; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v29, s0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v19, v26 +; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v26 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v6 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[20:23] offset:112 -; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v25 -; GFX9-DS128-NEXT: 
v_and_b32_e32 v14, 0xffff, v2 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[17:20] offset:96 -; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v1 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[14:17] offset:32 -; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v25 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v5 -; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[11:14] offset:16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v9, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v25 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[4:7] offset:64 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[24:27] offset:80 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[8:11] +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v5 +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v26 +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v26 +; GFX9-DS128-NEXT: ds_write_b128 v29, v[21:24] offset:112 +; 
GFX9-DS128-NEXT: v_mov_b32_e32 v1, v26 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v26 +; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v26 +; GFX9-DS128-NEXT: ds_write_b128 v29, v[4:7] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v29, v[0:3] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v29, v[25:28] offset:80 +; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v26 +; GFX9-DS128-NEXT: ds_write_b128 v29, v[18:21] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v29, v[14:17] offset:32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v26 +; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v26 +; GFX9-DS128-NEXT: ds_write_b128 v29, v[11:14] offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v9, v26 +; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v26 +; GFX9-DS128-NEXT: ds_write_b128 v29, v[8:11] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i64> @@ -6816,55 +6824,55 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 -; SI-NEXT: v_mov_b32_e32 v18, s0 +; SI-NEXT: v_mov_b32_e32 v23, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_mov_b32_e32 v10, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 +; SI-NEXT: v_bfe_i32 v11, v8, 0, 16 +; SI-NEXT: v_bfe_i32 v13, v13, 0, 16 +; SI-NEXT: v_bfe_i32 v15, v12, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v8, 16, v3 +; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v1 
+; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; SI-NEXT: ds_write2_b64 v23, v[11:12], v[8:9] offset0:14 offset1:15 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: v_bfe_i32 v1, v20, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v14, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; SI-NEXT: ds_write2_b64 v23, v[18:19], v[16:17] offset0:10 offset1:11 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v7 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; SI-NEXT: v_bfe_i32 v17, v4, 0, 16 +; SI-NEXT: ds_write2_b64 v23, v[10:11], v[7:8] offset0:6 offset1:7 ; SI-NEXT: v_ashrrev_i32_e32 v8, 16, v5 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v3 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 -; SI-NEXT: v_bfe_i32 v12, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; SI-NEXT: v_bfe_i32 v12, v14, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 -; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v12, v19, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_bfe_i32 v14, v17, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v19, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v2, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 
31, v10 -; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3 -; SI-NEXT: v_bfe_i32 v3, v15, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: ds_write2_b64 v18, v[7:8], v[3:4] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v18, v[10:11], v[16:17] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1 +; SI-NEXT: ds_write2_b64 v23, v[5:6], v[8:9] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v23, v[21:22], v[3:4] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v23, v[19:20], v[1:2] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v23, v[10:11], v[15:16] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v23, v[17:18], v[13:14] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: @@ -6875,59 +6883,58 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v23, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; 
VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v11, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v4, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v9, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v5, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v23, v[17:18], v[15:16] offset0:8 offset1:9 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 
v17, v9, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v23, v[19:20], v[4:5] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v6, 0, 16 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v7, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v13, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: ds_write2_b64 v23, v[3:4], v[17:18] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; 
VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v23, v[19:20], v[16:17] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v23, v[5:6], v[14:15] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v23, v[21:22], v[12:13] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v23, v[3:4], v[10:11] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v23, v[0:1], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: @@ -6937,59 +6944,58 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v23, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; 
GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v15, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v17, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v19, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v23, v[17:18], v[15:16] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v17, v9, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v23, v[19:20], v[4:5] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v6, 0, 16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; 
GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v23, v[3:4], v[17:18] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v21, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v23, v[19:20], v[16:17] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v23, v[5:6], v[14:15] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v23, v[21:22], v[12:13] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v23, v[3:4], v[10:11] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 
v23, v[0:1], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v16i16_to_v16i64: @@ -7168,124 +7174,124 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b128 v[3:6], v0 -; VI-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_mov_b32_e32 v18, v6 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 -; VI-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v15, v10 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 -; VI-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 -; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; VI-DS128-NEXT: v_bfe_i32 v16, v12, 0, 16 +; 
VI-DS128-NEXT: v_bfe_i32 v14, v5, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 -; VI-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v5, s0 +; VI-DS128-NEXT: v_bfe_i32 v18, v11, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v5, v[14:17] offset:80 +; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v4, v7 +; VI-DS128-NEXT: v_bfe_i32 v11, v1, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; VI-DS128-NEXT: v_bfe_i32 v7, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; VI-DS128-NEXT: ds_write_b128 v5, v[16:19] offset:64 +; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v4, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-DS128-NEXT: v_bfe_i32 v22, v6, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v9, v8, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v13, v10, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v20, v3, 0, 16 +; VI-DS128-NEXT: 
v_ashrrev_i32_e32 v23, 31, v22 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; VI-DS128-NEXT: ds_write_b128 v5, v[14:17] offset:112 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; VI-DS128-NEXT: ds_write_b128 v5, v[22:25] offset:96 +; VI-DS128-NEXT: ds_write_b128 v5, v[18:21] offset:48 +; VI-DS128-NEXT: ds_write_b128 v5, v[0:3] offset:32 +; VI-DS128-NEXT: ds_write_b128 v5, v[11:14] offset:16 +; VI-DS128-NEXT: ds_write_b128 v5, v[7:10] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v0 -; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX9-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 -; GFX9-DS128-NEXT: 
v_bfe_i32 v13, v3, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 -; GFX9-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v10 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 -; GFX9-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v6 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v12, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v11, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v5, v[14:17] 
offset:80 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-DS128-NEXT: v_bfe_i32 v11, v1, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX9-DS128-NEXT: v_bfe_i32 v7, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX9-DS128-NEXT: ds_write_b128 v5, v[16:19] offset:64 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v9, v8, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v10, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v3, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-DS128-NEXT: ds_write_b128 v5, v[14:17] offset:112 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[10:13] 
offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-DS128-NEXT: ds_write_b128 v5, v[22:25] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v5, v[18:21] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v5, v[0:3] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v5, v[11:14] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v5, v[7:10] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i64> @@ -7298,79 +7304,88 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_read2_b64 v[2:5], v0 offset0:2 offset1:3 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: ds_read2_b64 v[6:9], v0 offset1:1 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: v_mov_b32_e32 v21, v1 -; SI-NEXT: v_mov_b32_e32 v22, s0 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; SI-NEXT: ds_read2_b64 v[10:13], v0 offset0:4 offset1:5 -; SI-NEXT: ds_read2_b64 v[14:17], v0 offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:10 offset1:11 +; SI-NEXT: ds_read2_b64 v[5:8], v13 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: ds_read2_b64 v[0:3], v13 offset1:1 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: ds_read2_b64 v[9:12], v13 offset0:4 offset1:5 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v8 +; SI-NEXT: v_mov_b32_e32 v26, s0 +; SI-NEXT: 
ds_read2_b64 v[13:16], v13 offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v26, v[21:22], v[19:20] offset0:14 offset1:15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: ds_write2_b64 v26, v[19:20], v[17:18] offset0:10 offset1:11 ; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v9 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:6 offset1:7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v3 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: ds_write2_b64 v26, v[17:18], v[20:21] offset0:6 offset1:7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v1 +; SI-NEXT: ds_write2_b64 v26, v[20:21], v[18:19] offset0:2 offset1:3 ; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v17 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31 -; SI-NEXT: v_mov_b32_e32 v18, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_mov_b32_e32 v20, v1 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v15 -; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: ds_write2_b64 v22, v[4:5], v[17:18] offset0:12 offset1:13 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v2 -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: ds_write2_b64 v22, v[17:18], v[3:4] offset0:8 offset1:9 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: 
v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: ds_write2_b64 v22, v[8:9], v[2:3] offset0:4 offset1:5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 -; SI-NEXT: ds_write2_b64 v22, v[6:7], v[4:5] offset1:1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v6, v1 -; SI-NEXT: ds_write2_b64 v22, v[5:6], v[0:1] offset0:18 offset1:19 -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v13, v1 -; SI-NEXT: v_mov_b32_e32 v16, v1 -; SI-NEXT: ds_write2_b64 v22, v[19:20], v[12:13] offset0:28 offset1:29 -; SI-NEXT: ds_write2_b64 v22, v[17:18], v[15:16] offset0:24 offset1:25 -; SI-NEXT: ds_write2_b64 v22, v[10:11], v[2:3] offset0:20 offset1:21 -; SI-NEXT: ds_write2_b64 v22, v[4:5], v[8:9] offset0:16 offset1:17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: ds_write2_b64 v26, v[16:17], v[21:22] offset0:30 offset1:31 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: ds_write2_b64 v26, v[19:20], v[17:18] offset0:26 offset1:27 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v12 +; SI-NEXT: ds_write2_b64 v26, v[16:17], v[20:21] offset0:22 offset1:23 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; 
SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: ds_write2_b64 v26, v[7:8], v[17:18] offset0:12 offset1:13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: ds_write2_b64 v26, v[16:17], v[6:7] offset0:8 offset1:9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: ds_write2_b64 v26, v[2:3], v[7:8] offset0:4 offset1:5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: ds_write2_b64 v26, v[6:7], v[3:4] offset0:18 offset1:19 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: ds_write2_b64 v26, v[24:25], v[22:23] offset1:1 +; SI-NEXT: ds_write2_b64 v26, v[20:21], v[14:15] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v26, v[18:19], v[16:17] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v26, v[11:12], v[1:2] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v26, v[9:10], v[5:6] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64: @@ -7384,74 +7399,77 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 
offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v23, v5 ; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset1:1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 -; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27 -; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[18:19], v[20:21] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[22:23], v[1:2] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(4) +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[18:19], v[0:1] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[20:21], v[0:1] offset0:22 offset1:23 +; 
VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[18:19], v[0:1] offset0:20 offset1:21 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[8:9], v[0:1] offset0:18 offset1:19 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v13 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[6:7], v[0:1] offset0:16 offset1:17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[18:19], v[8:9] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[6:7], v[0:1] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, 
v[11:12], v[8:9] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v16 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[9:10], v[7:8] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v17 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[4:5], v[15:16] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[13:14], v[8:9] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v16 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, 
v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[7:8], v[12:13] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[11:12], v[6:7] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64: @@ -7460,76 +7478,79 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, v5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v23, v5 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; 
GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[18:19], v[20:21] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[22:23], v[1:2] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[18:19], v[0:1] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[20:21], v[0:1] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[18:19], v[0:1] offset0:20 offset1:21 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[8:9], v[0:1] offset0:18 offset1:19 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; 
GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(6) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[6:7], v[0:1] offset0:16 offset1:17 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v12 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[17:18], v[8:9] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[6:7], v[0:1] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[15:16], v[8:9] offset0:10 offset1:11 +; 
GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[9:10], v[7:8] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v13 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[4:5], v[15:16] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[12:13], v[8:9] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[7:8], v[14:15] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-NO-DS128-NEXT: 
v_mov_b32_e32 v7, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[11:12], v[6:7] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v32i16_to_v32i64: @@ -7833,95 +7854,95 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v1, s1 -; VI-DS128-NEXT: ds_read_b128 v[3:6], v1 -; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16 -; VI-DS128-NEXT: v_mov_b32_e32 v52, s0 +; VI-DS128-NEXT: ds_read_b128 v[9:12], v1 +; VI-DS128-NEXT: ds_read_b128 v[13:16], v1 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v53, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v12 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9 -; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48 -; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; VI-DS128-NEXT: 
v_and_b32_e32 v10, 0xffff, v16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v15 +; VI-DS128-NEXT: ds_read_b128 v[13:16], v1 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[30:33], v1 offset:48 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v11 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v9 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; VI-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; VI-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; VI-DS128-NEXT: v_and_b32_e32 v37, 0xffff, v13 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v15 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30 -; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; VI-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31 -; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v31, 0 -; VI-DS128-NEXT: v_mov_b32_e32 v49, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v51, v31 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29 -; VI-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240 -; VI-DS128-NEXT: v_mov_b32_e32 v46, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v48, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v27, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v29, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192 -; VI-DS128-NEXT: v_mov_b32_e32 v43, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v45, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96 -; VI-DS128-NEXT: v_mov_b32_e32 v24, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v26, v31 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10 -; VI-DS128-NEXT: ds_write_b128 v52, v[42:45] 
offset:160 -; VI-DS128-NEXT: v_mov_b32_e32 v40, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v42, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112 -; VI-DS128-NEXT: v_mov_b32_e32 v21, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v23, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176 -; VI-DS128-NEXT: v_mov_b32_e32 v37, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v39, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64 -; VI-DS128-NEXT: v_mov_b32_e32 v18, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v20, v31 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8 -; VI-DS128-NEXT: v_mov_b32_e32 v8, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v10, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128 -; VI-DS128-NEXT: v_mov_b32_e32 v34, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v36, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80 -; VI-DS128-NEXT: v_mov_b32_e32 v15, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v17, v31 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; VI-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208 -; VI-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144 -; VI-DS128-NEXT: v_mov_b32_e32 v5, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v7, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v33, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48 -; VI-DS128-NEXT: v_mov_b32_e32 v12, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v14, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v1, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v3, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224 -; VI-DS128-NEXT: ds_write_b128 v52, v[11:14] -; VI-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; VI-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v31 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; VI-DS128-NEXT: v_and_b32_e32 v49, 
0xffff, v33 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-DS128-NEXT: v_and_b32_e32 v31, 0xffff, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v32, 0 +; VI-DS128-NEXT: v_mov_b32_e32 v50, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v52, v32 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v30 +; VI-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v30 +; VI-DS128-NEXT: ds_write_b128 v53, v[49:52] offset:240 +; VI-DS128-NEXT: v_mov_b32_e32 v47, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v49, v32 +; VI-DS128-NEXT: ds_write_b128 v53, v[46:49] offset:192 +; VI-DS128-NEXT: v_mov_b32_e32 v44, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v46, v32 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v16 +; VI-DS128-NEXT: ds_write_b128 v53, v[43:46] offset:160 +; VI-DS128-NEXT: v_mov_b32_e32 v41, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v43, v32 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v14 +; VI-DS128-NEXT: v_mov_b32_e32 v14, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v16, v32 +; VI-DS128-NEXT: ds_write_b128 v53, v[40:43] offset:176 +; VI-DS128-NEXT: v_mov_b32_e32 v38, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v40, v32 +; VI-DS128-NEXT: ds_write_b128 v53, v[13:16] offset:208 +; VI-DS128-NEXT: ds_write_b128 v53, v[37:40] offset:128 +; VI-DS128-NEXT: v_mov_b32_e32 v35, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v37, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v28, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v30, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v11, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v13, v32 +; VI-DS128-NEXT: ds_write_b128 v53, v[34:37] offset:144 +; VI-DS128-NEXT: ds_write_b128 v53, v[27:30] offset:96 +; VI-DS128-NEXT: v_mov_b32_e32 v25, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v27, v32 +; VI-DS128-NEXT: ds_write_b128 v53, v[10:13] offset:112 +; VI-DS128-NEXT: ds_write_b128 v53, v[24:27] offset:64 +; VI-DS128-NEXT: v_mov_b32_e32 v22, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v24, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v34, v32 +; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, 
v9 +; VI-DS128-NEXT: ds_write_b128 v53, v[31:34] offset:224 +; VI-DS128-NEXT: v_mov_b32_e32 v18, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v20, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v7, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v9, v32 +; VI-DS128-NEXT: ds_write_b128 v53, v[21:24] offset:80 +; VI-DS128-NEXT: ds_write_b128 v53, v[17:20] offset:32 +; VI-DS128-NEXT: ds_write_b128 v53, v[6:9] offset:48 +; VI-DS128-NEXT: v_mov_b32_e32 v4, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v6, v32 +; VI-DS128-NEXT: ds_write_b128 v53, v[3:6] +; VI-DS128-NEXT: v_mov_b32_e32 v1, v32 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v32 +; VI-DS128-NEXT: ds_write_b128 v53, v[0:3] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i64: @@ -7929,95 +7950,95 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v1 -; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v52, s0 +; GFX9-DS128-NEXT: ds_read_b128 v[13:16], v1 +; GFX9-DS128-NEXT: ds_read_b128 v[17:20], v1 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v53, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9 -; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48 -; 
GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX9-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v18 +; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v17 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v19 +; GFX9-DS128-NEXT: ds_read_b128 v[17:20], v1 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[30:33], v1 offset:48 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GFX9-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX9-DS128-NEXT: v_and_b32_e32 v37, 0xffff, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v19 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30 -; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; GFX9-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31 -; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v31, 0 -; GFX9-DS128-NEXT: v_mov_b32_e32 v49, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v51, v31 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240 -; 
GFX9-DS128-NEXT: v_mov_b32_e32 v46, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v48, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v29, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192 -; GFX9-DS128-NEXT: v_mov_b32_e32 v43, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v45, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96 -; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v26, v31 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160 -; GFX9-DS128-NEXT: v_mov_b32_e32 v40, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v42, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112 -; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176 -; GFX9-DS128-NEXT: v_mov_b32_e32 v37, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v39, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64 -; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v31 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8 -; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128 -; GFX9-DS128-NEXT: v_mov_b32_e32 v34, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80 -; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v31 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144 -; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v31 -; GFX9-DS128-NEXT: 
v_mov_b32_e32 v7, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v33, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48 -; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[11:14] -; GFX9-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v31 +; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v31 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v33 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-DS128-NEXT: v_and_b32_e32 v31, 0xffff, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v32, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v50, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v52, v32 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v30 +; GFX9-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v30 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[49:52] offset:240 +; GFX9-DS128-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v49, v32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[46:49] offset:192 +; GFX9-DS128-NEXT: v_mov_b32_e32 v44, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX9-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v20 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[43:46] offset:160 +; GFX9-DS128-NEXT: v_mov_b32_e32 v41, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v43, v32 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[40:43] offset:176 +; GFX9-DS128-NEXT: v_mov_b32_e32 v38, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v40, v32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[17:20] offset:208 +; GFX9-DS128-NEXT: 
ds_write_b128 v53, v[37:40] offset:128 +; GFX9-DS128-NEXT: v_mov_b32_e32 v35, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v37, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v28, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v30, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[34:37] offset:144 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[27:30] offset:96 +; GFX9-DS128-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[14:17] offset:112 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[24:27] offset:64 +; GFX9-DS128-NEXT: v_mov_b32_e32 v22, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v34, v32 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[31:34] offset:224 +; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v9, v32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[21:24] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[10:13] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[6:9] offset:48 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[3:6] +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v32 +; GFX9-DS128-NEXT: ds_write_b128 v53, v[0:3] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i64> @@ -8035,104 +8056,104 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 ; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7 -; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 -; SI-NEXT: s_waitcnt lgkmcnt(3) -; SI-NEXT: v_mov_b32_e32 v18, v7 -; SI-NEXT: 
v_ashrrev_i32_e32 v17, 31, v7 -; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7 -; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: v_mov_b32_e32 v7, s0 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_mov_b32_e32 v13, v7 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v7 +; SI-NEXT: v_bfe_i32 v20, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v13, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v18, 16, v7 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; SI-NEXT: v_mov_b32_e32 v7, s0 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v7, v[21:22], v[18:19] offset0:14 offset1:15 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v5 -; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11 -; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; SI-NEXT: s_waitcnt lgkmcnt(3) ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v3 ; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v3 +; SI-NEXT: ds_write2_b64 v7, v[20:21], v[16:17] offset0:10 offset1:11 +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v1 +; SI-NEXT: v_bfe_i32 v20, v1, 0, 16 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[21:22] offset0:6 offset1:7 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v1 -; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3 -; SI-NEXT: s_waitcnt lgkmcnt(5) +; SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; SI-NEXT: s_waitcnt lgkmcnt(4) ; SI-NEXT: v_mov_b32_e32 v1, v11 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11 -; SI-NEXT: 
v_ashrrev_i32_e32 v16, 16, v11 ; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v11 +; SI-NEXT: ds_write2_b64 v7, v[20:21], v[16:17] offset0:2 offset1:3 +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v11 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; SI-NEXT: v_bfe_i32 v20, v9, 0, 16 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[21:22] offset0:30 offset1:31 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v9 -; SI-NEXT: v_bfe_i32 v18, v9, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27 -; SI-NEXT: s_waitcnt lgkmcnt(6) +; SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; SI-NEXT: s_waitcnt lgkmcnt(5) ; SI-NEXT: v_mov_b32_e32 v1, v15 -; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v15 -; SI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:22 offset1:23 +; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v15 +; SI-NEXT: ds_write2_b64 v7, v[20:21], v[16:17] offset0:26 offset1:27 +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v15 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v13 -; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13 ; SI-NEXT: v_bfe_i32 v17, v13, 0, 16 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[21:22] offset0:22 offset1:23 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13 ; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v19, v6, 0, 16 +; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; SI-NEXT: 
v_ashrrev_i32_e32 v16, 31, v15 -; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 -; SI-NEXT: v_bfe_i32 v5, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: ds_write2_b64 v7, v[19:20], v[5:6] offset0:12 offset1:13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_bfe_i32 v3, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v4, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: ds_write2_b64 v7, v[3:4], v[5:6] offset0:8 offset1:9 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_bfe_i32 v1, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v15, v2, 0, 16 +; SI-NEXT: ds_write2_b64 v7, v[5:6], v[3:4] offset0:8 offset1:9 +; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v13, 0, 16 +; SI-NEXT: v_bfe_i32 v9, v11, 0, 16 +; SI-NEXT: ds_write2_b64 v7, v[15:16], v[1:2] offset0:4 offset1:5 ; SI-NEXT: v_bfe_i32 v1, v12, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v14, 0, 16 -; SI-NEXT: v_bfe_i32 v5, v8, 0, 16 -; SI-NEXT: v_bfe_i32 v8, v10, 0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_bfe_i32 v9, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v12, v11, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5 -; SI-NEXT: v_bfe_i32 v11, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v11, v14, 0, 16 +; SI-NEXT: v_bfe_i32 v13, v8, 0, 16 +; SI-NEXT: v_bfe_i32 v15, 
v10, 0, 16 +; SI-NEXT: v_bfe_i32 v17, v17, 0, 16 +; SI-NEXT: v_bfe_i32 v19, v18, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v0, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_bfe_i32 v13, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_bfe_i32 v15, v15, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_bfe_i32 v16, v14, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1 -; SI-NEXT: v_bfe_i32 v17, v18, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; SI-NEXT: ds_write2_b64 v7, v[8:9], v[17:18] offset0:28 offset1:29 -; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:24 offset1:25 -; SI-NEXT: ds_write2_b64 v7, v[3:4], v[13:14] offset0:20 offset1:21 -; SI-NEXT: ds_write2_b64 v7, v[1:2], v[11:12] offset0:16 offset1:17 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v7, v[21:22], v[19:20] offset1:1 +; SI-NEXT: ds_write2_b64 v7, v[15:16], v[17:18] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v7, v[13:14], v[9:10] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v7, v[11:12], v[5:6] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v7, v[1:2], v[3:4] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: @@ -8140,224 +8161,229 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v7 offset0:6 offset1:7 -; 
VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v7 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v10, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v14, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v13, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v7 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v7 offset1:1 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v12 offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[20:21], v[18:19] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v9, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[15:16], v[17:18] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; 
VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:28 offset1:29 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v8, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(4) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[21:22], v[19:20] offset0:26 offset1:27 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[1:2], v[16:17] offset0:26 offset1:27 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:24 offset1:25 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(6) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v20, v8, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v7, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[16:17] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v15, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[7:8], v[20:21] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v8, 0, 16 +; VI-NO-DS128-NEXT: 
v_bfe_i32 v5, v5, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[6:7], v[16:17] offset0:20 offset1:21 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:22 offset1:23 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v14, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v12, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v16, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[17:18] offset0:16 offset1:17 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v6, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[18:19] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v20, v3, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[5:6] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[20:21], v[4:5] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v1, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[15:16] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v13 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[3:4] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[5:6] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v10 +; VI-NO-DS128-NEXT: 
v_lshrrev_b32_e32 v15, 16, v11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v8, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v13 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[2:3] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; VI-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v7, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v10, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v8, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[21:22] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v7, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5 -; 
VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[1:2] offset1:1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[21:22], v[11:12] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[19:20], v[17:18] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[7:8] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[3:4], v[15:16] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[1:2], v[9:10] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v8 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v17, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; 
GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v12 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[20:21], v[18:19] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v21, v9, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[15:16], v[17:18] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v8, 0, 16 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[21:22], v[19:20] offset0:26 offset1:27 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v8 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v8 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[16:17] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[7:8], v[20:21] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 
31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[6:7], v[16:17] offset0:20 offset1:21 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:28 offset1:29 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[18:19] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v17, v8, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v13 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[16:17] offset0:26 offset1:27 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:24 offset1:25 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:22 offset1:23 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; 
GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:20 offset1:21 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:16 offset1:17 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v10, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[3:4] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; GFX9-NO-DS128-NEXT: ds_write2_b64 
v15, v[8:9], v[3:4] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v8, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[18:19], v[5:6] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v18, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v19, 0, 16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[20:21] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v12 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[20:21], v[4:5] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v11 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[18:19] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[11:12], v[4:5] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v11, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v21, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[5:6] offset1:1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[21:22], v[11:12] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[19:20], v[17:18] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[7:8] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: 
ds_write2_b64 v14, v[3:4], v[15:16] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[9:10] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v32i16_to_v32i64: @@ -8708,114 +8734,112 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 -; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:48 -; VI-DS128-NEXT: ds_read_b128 v[9:12], v4 offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v13, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[9:12], v13 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[13:16], v13 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 -; VI-DS128-NEXT: ds_read_b128 v[17:20], v4 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; VI-DS128-NEXT: v_bfe_i32 v19, v17, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v17, v2, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v2, v3 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224 -; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v15, v3, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-DS128-NEXT: v_bfe_i32 v21, v2, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_bfe_i32 v23, v2, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-DS128-NEXT: ds_write_b128 v8, 
v[13:16] offset:240 -; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:208 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(5) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192 -; VI-DS128-NEXT: v_mov_b32_e32 v13, v12 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; VI-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176 -; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; VI-DS128-NEXT: ds_write_b128 v8, v[17:20] offset:224 +; VI-DS128-NEXT: v_bfe_i32 v19, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-DS128-NEXT: 
v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-DS128-NEXT: ds_write_b128 v8, v[17:20] offset:192 +; VI-DS128-NEXT: v_bfe_i32 v17, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; VI-DS128-NEXT: ds_write_b128 v8, v[21:24] offset:240 +; VI-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v8, v[17:20] offset:208 +; VI-DS128-NEXT: v_bfe_i32 v19, v11, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v0, v12 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; VI-DS128-NEXT: ds_write_b128 v8, v[19:22] offset:160 +; VI-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; VI-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v8, v[17:20] offset:176 +; VI-DS128-NEXT: v_bfe_i32 v19, v9, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: v_bfe_i32 v11, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; VI-DS128-NEXT: ds_write_b128 v8, v[19:22] offset:128 +; VI-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v8, v[17:20] offset:144 +; VI-DS128-NEXT: v_bfe_i32 v19, v6, 0, 16 +; 
VI-DS128-NEXT: v_mov_b32_e32 v0, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; VI-DS128-NEXT: ds_write_b128 v8, v[19:22] offset:96 +; VI-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v8, v[17:20] offset:112 +; VI-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: v_mov_b32_e32 v0, v16 +; VI-DS128-NEXT: v_bfe_i32 v9, v13, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v13, v15, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v8, v[19:22] offset:64 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-DS128-NEXT: v_bfe_i32 v17, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v19, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v14, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) -; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; VI-DS128-NEXT: v_mov_b32_e32 v5, v20 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96 -; VI-DS128-NEXT: v_bfe_i32 v9, v5, 0, 16 -; VI-DS128-NEXT: 
v_lshrrev_b32_e32 v5, 16, v20 -; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:112 -; VI-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64 -; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v15, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v4, v7 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80 -; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48 +; VI-DS128-NEXT: ds_write_b128 v8, v[17:20] offset:80 +; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:32 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:48 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] ; VI-DS128-NEXT: ds_write_b128 
v8, v[0:3] offset:16 ; VI-DS128-NEXT: s_endpgm @@ -8824,114 +8848,115 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v13, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:48 -; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v8 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v8 offset:32 ; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0 -; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v13 -; GFX9-DS128-NEXT: ds_read_b128 v[18:21], v13 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[13:16], v8 +; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v8 offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v17, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v17, v6, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v6, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v6, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_bfe_i32 v23, v6, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:240 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 +; GFX9-DS128-NEXT: 
ds_write_b128 v12, v[17:20] offset:224 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v17, v4, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:208 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5) -; GFX9-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192 -; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v3 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:192 +; GFX9-DS128-NEXT: v_bfe_i32 v17, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v6, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[21:24] offset:240 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v5, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:208 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: v_bfe_i32 v17, v2, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160 -; 
GFX9-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[19:22] offset:160 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v2, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:176 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v21 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: 
ds_write_b128 v12, v[13:16] offset:96 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[0:3] offset:144 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX9-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[19:22] offset:128 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v5, 0, 16 +; 
GFX9-DS128-NEXT: v_bfe_i32 v19, v10, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[0:3] offset:112 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[19:22] offset:96 +; GFX9-DS128-NEXT: v_bfe_i32 v7, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v5, v8, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[5:8] offset:64 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-DS128-NEXT: v_bfe_i32 v24, v9, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v26, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v14, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v15, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 31, v26 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] 
offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[6:9] +; GFX9-DS128-NEXT: ds_write_b128 v12, v[24:27] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[6:9] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[20:23] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[16:19] ; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir index 3b3ea3f37db80..f7c6531d75a40 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir @@ -797,8 +797,6 @@ body: | ; GFX908-NEXT: [[DEF27:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[DEF28:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[DEF29:%[0-9]+]]:agpr_32 = IMPLICIT_DEF - ; GFX908-NEXT: [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF - ; GFX908-NEXT: [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 @@ -825,6 +823,8 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF + ; 
GFX908-NEXT: [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode @@ -2281,6 +2281,8 @@ body: | ; GFX90A-NEXT: [[DEF59:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[DEF60:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[DEF61:%[0-9]+]]:agpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode + ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode ; GFX90A-NEXT: [[DEF62:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[DEF63:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[DEF64:%[0-9]+]]:agpr_32 = IMPLICIT_DEF @@ -2475,8 +2477,6 @@ body: | ; GFX90A-NEXT: [[DEF253:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[DEF254:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[DEF255:%[0-9]+]]:agpr_32 = IMPLICIT_DEF - ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode - ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1: ; GFX90A-NEXT: [[DEF256:%[0-9]+]]:agpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll index ca16e251d51cf..083b75781cd61 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll @@ -5888,22 +5888,22 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-LABEL: v_maximumnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -5922,16 +5922,16 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: 
v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -5954,8 +5954,8 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 @@ -5974,7 +5974,7 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -10941,357 +10941,357 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v13 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v29 -; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v28 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, 
v28 +; GFX10-NEXT: v_cndmask_b32_e32 v31, v33, v32, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 ; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v29 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v27 +; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v31, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; GFX10-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v26 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v51, v51 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v48, v39, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v69, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v70, 0xffff0000, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v27 -; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v37, v35, v32, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v33 -; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v70, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v71, 0xffff0000, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v36, v38, v34, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v71, 16, v23 
+; GFX10-NEXT: v_lshrrev_b32_e32 v80, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v81, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v83, 0xffff0000, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v32 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v35, v34, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v37 -; GFX10-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v85, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v36 -; GFX10-NEXT: v_cndmask_b32_e32 v35, v39, v33, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v34 -; GFX10-NEXT: v_cmp_gt_f32_e64 s5, v31, v38 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v26 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v53, v52, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v35 -; GFX10-NEXT: v_cmp_gt_f32_e64 s4, v39, v48 -; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v31, v31 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v84, 16, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v85, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v22 +; GFX10-NEXT: v_cndmask_b32_e32 v35, v37, v33, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v36 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v86, v86 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v33 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; GFX10-NEXT: v_cmp_gt_f32_e64 s4, v51, v52 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v39, v38, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v39, 
0xffff0000, v26 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v86, v86 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v37 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v8 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v86, v86 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v39 +; GFX10-NEXT: v_cndmask_b32_e32 v38, v64, v55, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v24 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v86, v86 +; GFX10-NEXT: v_cmp_gt_f32_e64 s6, v65, v67 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v38 +; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v48, v66, v64, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v24 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v87, v39, v37, s6 +; GFX10-NEXT: v_cmp_eq_u16_e64 s6, 0, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v55, v55, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v86, v86 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v55 +; GFX10-NEXT: v_cndmask_b32_e32 v64, v64, v48, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v48 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v86, v86 +; GFX10-NEXT: v_cndmask_b32_e32 v66, v80, v71, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 +; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v64 +; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v66 +; GFX10-NEXT: v_cndmask_b32_e32 v71, v71, v66, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX10-NEXT: v_cmp_gt_f32_e64 s8, v70, v80 +; GFX10-NEXT: v_lshrrev_b32_e32 v70, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v71 +; 
GFX10-NEXT: v_cndmask_b32_e32 v83, v85, v84, vcc_lo ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v49, v50 -; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v52, v38, s6 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v39, v39 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v38 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v48 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v50, v49, s6 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v31, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v39 -; GFX10-NEXT: v_cndmask_b32_e64 v50, v49, v39, s6 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v52, v52 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v54, v53, s6 -; GFX10-NEXT: v_cmp_gt_f32_e64 s6, v51, v55 -; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v52, v52 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v50 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v49 -; GFX10-NEXT: v_cndmask_b32_e64 v52, v53, v49, s7 -; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v55, v55 -; GFX10-NEXT: v_cmp_gt_f32_e64 s8, v31, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v65, v64, s7 -; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX10-NEXT: v_cndmask_b32_e64 v52, v84, v83, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v83 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v49, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v52 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v51, v50, s5 +; GFX10-NEXT: v_cmp_gt_f32_e64 s5, v53, v54 +; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX10-NEXT: v_cmp_gt_f32_e64 s10, v85, v84 +; GFX10-NEXT: v_cndmask_b32_e64 v50, v50, v49, s7 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v49 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v53, v53 -; 
GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v55 -; GFX10-NEXT: v_cndmask_b32_e64 v53, v64, v55, s7 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v65, v65 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v53 -; GFX10-NEXT: v_cndmask_b32_e64 v65, v67, v66, s7 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v52 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v64, v64 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v65 -; GFX10-NEXT: v_cmp_gt_f32_e64 s9, v54, v67 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v64, v66, v65, s7 +; GFX10-NEXT: v_lshrrev_b32_e32 v84, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v50 +; GFX10-NEXT: v_cndmask_b32_e64 v53, v65, v54, s7 ; GFX10-NEXT: v_cmp_gt_f32_e64 s7, v68, v69 -; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v21 -; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v54, v54 -; GFX10-NEXT: v_lshrrev_b32_e32 v69, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v64 -; GFX10-NEXT: v_cndmask_b32_e64 v54, v67, v66, s10 -; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v68, v68 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v70, v69, s10 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v67, v67 -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v54 -; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v68 -; GFX10-NEXT: v_cndmask_b32_e64 v66, v66, v54, s10 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v71, v71 -; GFX10-NEXT: v_lshrrev_b32_e32 v71, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v66 -; GFX10-NEXT: v_cndmask_b32_e64 v67, v69, v68, s10 -; GFX10-NEXT: v_and_b32_e32 v69, 0xffff0000, v3 -; GFX10-NEXT: v_cmp_gt_f32_e64 s11, v70, v81 -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v67 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v69, v69 -; GFX10-NEXT: v_and_b32_e32 v70, 0xffff0000, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX10-NEXT: v_cmp_gt_f32_e64 s12, v82, v83 -; GFX10-NEXT: v_cndmask_b32_e64 v69, v80, v71, s10 -; GFX10-NEXT: 
v_cmp_gt_f32_e64 s10, v31, v51 -; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 -; GFX10-NEXT: v_lshrrev_b32_e32 v80, 16, v18 -; GFX10-NEXT: v_and_b32_e32 v82, 0xffff0000, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v69 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v51, v71, v69, s13 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v70, v70 -; GFX10-NEXT: v_and_b32_e32 v71, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v70, v81, v80, s13 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v82, v82 -; GFX10-NEXT: v_lshrrev_b32_e32 v81, 16, v17 -; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v80, v80, v70, s13 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v71, v71 -; GFX10-NEXT: v_and_b32_e32 v71, 0xffff0000, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v82, v82, v81, s13 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v71, v71 -; GFX10-NEXT: v_cmp_gt_f32_e64 s13, v31, v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v70 -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v80 -; GFX10-NEXT: v_cndmask_b32_e64 v71, v81, v82, s14 -; GFX10-NEXT: v_cmp_gt_f32_e64 s14, v31, v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v82 -; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v71 -; GFX10-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX10-NEXT: v_cmp_gt_f32_e64 s15, v31, v81 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v81, 16, v16 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v31, v31 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v83, v83, v81, s16 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v31, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v83 -; GFX10-NEXT: v_cndmask_b32_e64 v81, v81, v83, s16 +; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v69, 16, v19 +; GFX10-NEXT: v_cndmask_b32_e64 v54, v54, v53, s9 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v53 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v68, v68 +; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v54 +; GFX10-NEXT: 
v_cndmask_b32_e64 v68, v70, v69, s9 +; GFX10-NEXT: v_cmp_gt_f32_e64 s9, v82, v81 +; GFX10-NEXT: v_and_b32_e32 v81, 0xffff0000, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v18 +; GFX10-NEXT: v_cmp_gt_f32_e64 s12, v65, v80 +; GFX10-NEXT: v_cndmask_b32_e64 v69, v69, v68, s11 +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v68 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v81, v81 +; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v69 +; GFX10-NEXT: v_cndmask_b32_e64 v81, v84, v82, s11 +; GFX10-NEXT: v_cmp_gt_f32_e64 s11, v51, v67 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v80, v82, v81, s13 ; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v81 -; GFX10-NEXT: v_cmp_gt_f32_e64 s16, v31, v84 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v14 -; GFX10-NEXT: v_lshrrev_b32_e32 v84, 16, v30 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v31, v31 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v85, v84, s17 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v80 +; GFX10-NEXT: v_cndmask_b32_e64 v51, v65, v67, s13 +; GFX10-NEXT: v_cmp_gt_f32_e64 s13, v70, v85 +; GFX10-NEXT: v_and_b32_e32 v70, 0xffff0000, v17 +; GFX10-NEXT: v_lshrrev_b32_e32 v85, 16, v16 +; GFX10-NEXT: v_cmp_gt_f32_e64 s14, v84, v82 +; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v84, 0xffff0000, v16 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v70, v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v51 +; GFX10-NEXT: v_cndmask_b32_e64 v70, v82, v85, s15 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v84, v84 +; GFX10-NEXT: v_cndmask_b32_e64 v67, v67, v51, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v84, v85, v70, s15 +; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v67 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v86, 16, v84 +; GFX10-NEXT: v_cmp_gt_f32_e64 s15, v65, v82 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX10-NEXT: 
v_cmp_gt_f32_e64 s16, v85, v86 +; GFX10-NEXT: v_lshrrev_b32_e32 v85, 16, v14 +; GFX10-NEXT: v_cmp_u_f32_e64 s17, v65, v65 +; GFX10-NEXT: v_cndmask_b32_e64 v96, v84, v70, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v65, v85, v82, s17 ; GFX10-NEXT: v_and_b32_e32 v85, 0xffff0000, v30 ; GFX10-NEXT: v_cmp_u_f32_e64 s17, v85, v85 -; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v31 -; GFX10-NEXT: v_cndmask_b32_e64 v84, v84, v31, s17 -; GFX10-NEXT: v_lshlrev_b32_e32 v86, 16, v84 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v65 +; GFX10-NEXT: v_cndmask_b32_e64 v82, v82, v65, s17 +; GFX10-NEXT: v_lshlrev_b32_e32 v86, 16, v82 ; GFX10-NEXT: v_cmp_gt_f32_e64 s17, v85, v86 -; GFX10-NEXT: v_lshrrev_b32_e32 v86, 16, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v85, v84, v31, s17 -; GFX10-NEXT: v_cmp_eq_u16_e64 s17, 0, v31 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v85, v31, s17 -; GFX10-NEXT: v_cmp_eq_u16_e64 s17, 0, v84 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v31, v84, s17 -; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v85 -; GFX10-NEXT: v_cmp_eq_f32_e64 s17, 0, v84 -; GFX10-NEXT: v_cndmask_b32_e64 v84, v37, v32, s5 -; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0, v32 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v85, v31, s17 -; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v84, v32, s5 -; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0, v37 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v32, v37, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v36, v34, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v86, v36, v34, s4 ; GFX10-NEXT: v_cmp_eq_u16_e64 s4, 0, v34 -; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0, v39 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v37, v34, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v85, v82, v65, s17 +; GFX10-NEXT: v_cmp_eq_u16_e64 s17, 0, v65 +; GFX10-NEXT: v_cndmask_b32_e64 v65, v85, v65, s17 +; GFX10-NEXT: v_cmp_eq_u16_e64 s17, 0, v82 +; GFX10-NEXT: v_cndmask_b32_e64 v65, v65, v82, s17 +; GFX10-NEXT: v_cndmask_b32_e32 v82, v32, v31, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e32 v31, v82, 
v31, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v32 +; GFX10-NEXT: v_cndmask_b32_e32 v32, v31, v32, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v82 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v86, v34, s4 ; GFX10-NEXT: v_cmp_eq_u16_e64 s4, 0, v36 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v34, v36, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v36, v35, v33, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v33 -; GFX10-NEXT: v_cmp_eq_u16_e64 s4, 0, v38 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v35 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v36 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 -; GFX10-NEXT: v_cndmask_b32_e64 v35, v48, v38, s6 -; GFX10-NEXT: v_cmp_eq_u16_e64 s6, 0, v49 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v38, v35, v38, s4 -; GFX10-NEXT: v_cmp_eq_u16_e64 s4, 0, v48 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v38, v48, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v35 -; GFX10-NEXT: v_cmp_eq_f32_e64 s4, 0, v48 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v50, v39, s8 -; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0, v65 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v48, v39, s5 -; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0, v50 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v50, s5 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v48 -; GFX10-NEXT: v_cmp_eq_f32_e64 s5, 0, v50 -; GFX10-NEXT: v_cndmask_b32_e64 v50, v52, v49, s9 -; GFX10-NEXT: v_cmp_eq_u16_e64 s9, 0, v68 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v50, v49, s6 -; GFX10-NEXT: v_cmp_eq_u16_e64 s6, 0, v52 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v52, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v50 -; GFX10-NEXT: v_cmp_eq_f32_e64 s6, 0, v52 -; GFX10-NEXT: v_cndmask_b32_e64 v52, v53, v55, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v32, v82, v32, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v34, v31, v36, s4 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v86 +; GFX10-NEXT: v_cndmask_b32_e64 v36, v35, v33, s5 +; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0, v33 +; GFX10-NEXT: v_cmp_eq_f32_e64 s4, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v36, v33, s5 +; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0, v35 +; GFX10-NEXT: v_cndmask_b32_e64 v33, v86, v34, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v35, v31, v35, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v36 +; GFX10-NEXT: v_cmp_eq_f32_e64 s5, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v87, v37, s6 +; GFX10-NEXT: v_cmp_eq_u16_e64 s6, 0, v39 +; GFX10-NEXT: v_cndmask_b32_e64 v34, v36, v35, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v31, v39, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v87 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v55, v38, s7 +; GFX10-NEXT: v_cmp_eq_u16_e64 s7, 0, v38 +; GFX10-NEXT: v_cmp_eq_f32_e64 s6, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v39, v38, s7 ; GFX10-NEXT: v_cmp_eq_u16_e64 s7, 0, v55 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v52, v55, s7 -; GFX10-NEXT: v_cmp_eq_u16_e64 s7, 0, v53 -; GFX10-NEXT: v_cndmask_b32_e64 v53, v55, v53, s7 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v52 -; GFX10-NEXT: v_cmp_eq_f32_e64 s7, 0, v55 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v64, v65, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v69 -; GFX10-NEXT: v_cndmask_b32_e64 v36, v52, v53, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v65, v55, v65, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v35, v87, v37, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v38, v31, v55, s7 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v39 +; GFX10-NEXT: v_cndmask_b32_e64 v55, v64, v48, s8 +; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0, v48 +; GFX10-NEXT: v_cmp_eq_f32_e64 s7, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v55, v48, s8 ; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0, v64 -; GFX10-NEXT: v_cndmask_b32_e64 v64, v65, v64, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v65, v66, v54, s11 -; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0, v54 -; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v54, s8 -; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0, v66 -; 
GFX10-NEXT: v_cndmask_b32_e64 v54, v54, v66, s8 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v65 -; GFX10-NEXT: v_cmp_eq_f32_e64 s8, 0, v66 -; GFX10-NEXT: v_cndmask_b32_e64 v66, v67, v68, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v66, v68, s9 -; GFX10-NEXT: v_cmp_eq_u16_e64 s9, 0, v67 -; GFX10-NEXT: v_cndmask_b32_e64 v67, v68, v67, s9 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v66 -; GFX10-NEXT: v_cmp_eq_f32_e64 s9, 0, v68 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v51, v69, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v69, v68, v69, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v51, v69, v51, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v69, v80, v70, s14 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v70 -; GFX10-NEXT: v_cndmask_b32_e64 v70, v69, v70, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v80 -; GFX10-NEXT: v_cndmask_b32_e64 v70, v70, v80, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v80, v71, v82, s15 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v82 -; GFX10-NEXT: v_cndmask_b32_e64 v82, v80, v82, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v71 -; GFX10-NEXT: v_cndmask_b32_e64 v71, v82, v71, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v82, v81, v83, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v36, v39, v38, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v48, v31, v64, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v55 +; GFX10-NEXT: v_cndmask_b32_e64 v64, v71, v66, s9 +; GFX10-NEXT: v_cmp_eq_u16_e64 s9, 0, v66 +; GFX10-NEXT: v_cmp_eq_f32_e64 s8, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v64, v66, s9 +; GFX10-NEXT: v_cmp_eq_u16_e64 s9, 0, v71 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v55, v48, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v66, v31, v71, s9 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v64 +; GFX10-NEXT: v_cndmask_b32_e64 v71, v52, v83, s10 ; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v82 -; GFX10-NEXT: v_cndmask_b32_e64 v83, v82, v83, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v81 -; GFX10-NEXT: v_cndmask_b32_e64 v81, v83, v81, s10 -; GFX10-NEXT: 
buffer_load_dword v83, off, s[0:3], s32 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v85, v85 -; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v14 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v85, v85 -; GFX10-NEXT: v_cndmask_b32_e64 v85, v14, v30, s11 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v14, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v84 -; GFX10-NEXT: v_cndmask_b32_e64 v87, v30, v85, s11 -; GFX10-NEXT: v_cmp_eq_f32_e64 s12, 0, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v30, v35, v38, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v35, v50, v49, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v65, v54, s8 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v80 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v84, v32, s12 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v37 -; GFX10-NEXT: v_and_b32_e32 v84, 0xffff0000, v15 -; GFX10-NEXT: v_cmp_eq_f32_e64 s12, 0, v32 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v37, v34, s12 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v55 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v48, v39, s5 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v68 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v69 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v37 -; GFX10-NEXT: v_cndmask_b32_e32 v37, v55, v64, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX10-NEXT: v_cmp_eq_f32_e64 s9, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v71, v83, s10 +; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0, v52 +; GFX10-NEXT: v_cndmask_b32_e64 v83, v50, v49, s11 +; GFX10-NEXT: v_cmp_eq_u16_e64 s11, 0, v49 +; GFX10-NEXT: v_cndmask_b32_e64 v38, v64, v66, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v52, v31, v52, s10 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v71 +; GFX10-NEXT: v_cmp_eq_f32_e64 s10, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v83, v49, s11 +; GFX10-NEXT: v_cmp_eq_u16_e64 s11, 0, v50 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v71, v52, s10 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v96 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v31, v50, s11 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v83 +; GFX10-NEXT: v_cndmask_b32_e64 
v50, v54, v53, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0, v53 +; GFX10-NEXT: v_cmp_eq_f32_e64 s11, 0, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v86, 16, v50 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v50, v53, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0, v54 +; GFX10-NEXT: v_cndmask_b32_e64 v48, v83, v49, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v53, v31, v54, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v54, v69, v68, s13 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0, v68 +; GFX10-NEXT: v_cmp_eq_u16_e64 s13, 0, v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v87, 16, v54 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v54, v68, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0, v69 +; GFX10-NEXT: v_cndmask_b32_e64 v68, v31, v69, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v69, v80, v81, s14 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0, v81 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v69 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v69, v81, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0, v80 +; GFX10-NEXT: v_cndmask_b32_e64 v81, v67, v51, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v80, v31, v80, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v81 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v81, v51, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0, v67 +; GFX10-NEXT: v_cndmask_b32_e64 v51, v31, v67, s12 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v31, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v96, v70, s13 +; GFX10-NEXT: v_cmp_eq_u16_e64 s13, 0, v84 +; GFX10-NEXT: v_cndmask_b32_e64 v70, v31, v84, s13 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v85 +; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v14 +; GFX10-NEXT: v_cmp_eq_f32_e64 s13, 0, v31 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v84, v84 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v85, v65, s13 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v66, v14, v30, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v30 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; 
GFX10-NEXT: v_cmp_u_f32_e64 s4, v85, v85 +; GFX10-NEXT: v_cndmask_b32_e64 v83, v30, v66, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v83 -; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v83 -; GFX10-NEXT: v_cndmask_b32_e64 v64, v15, v83, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v66, v67, s9 -; GFX10-NEXT: v_cndmask_b32_e32 v54, v86, v50, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v64 -; GFX10-NEXT: v_cndmask_b32_e32 v53, v50, v54, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v54 -; GFX10-NEXT: v_cndmask_b32_e32 v55, v83, v64, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v55 -; GFX10-NEXT: v_cndmask_b32_e32 v39, v68, v51, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v53 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 -; GFX10-NEXT: v_cndmask_b32_e32 v48, v69, v70, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v50, v51 -; GFX10-NEXT: v_cndmask_b32_e32 v51, v53, v54, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v66 -; GFX10-NEXT: v_cndmask_b32_e32 v65, v55, v64, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49 -; GFX10-NEXT: v_cndmask_b32_e32 v50, v80, v71, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54 -; GFX10-NEXT: v_cndmask_b32_e32 v49, v51, v54, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v64 -; GFX10-NEXT: v_cndmask_b32_e32 v54, v65, v64, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v51 -; GFX10-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v65 -; GFX10-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v67 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v67 +; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v67 +; GFX10-NEXT: 
v_cndmask_b32_e64 v84, v15, v67, s12 +; GFX10-NEXT: v_cndmask_b32_e32 v82, v82, v49, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v86 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v50, v53, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v82 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v84 +; GFX10-NEXT: v_cndmask_b32_e32 v49, v49, v82, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX10-NEXT: v_cndmask_b32_e32 v53, v67, v84, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v87 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v53 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v54, v68, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v49 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v55 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v69, v80, vcc_lo +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v50, v54 +; GFX10-NEXT: v_cndmask_b32_e32 v54, v49, v82, vcc_lo +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v67 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v54 +; GFX10-NEXT: v_cndmask_b32_e32 v55, v53, v84, vcc_lo ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v29 +; GFX10-NEXT: v_cndmask_b32_e32 v50, v81, v51, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v82 +; GFX10-NEXT: v_cndmask_b32_e32 v51, v54, v82, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v84 +; GFX10-NEXT: v_cndmask_b32_e32 v64, v55, v84, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49 ; GFX10-NEXT: v_cndmask_b32_e32 v49, v51, v49, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v53 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; GFX10-NEXT: v_cndmask_b32_e32 v53, v64, v53, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v29 +; GFX10-NEXT: v_cndmask_b32_e32 v49, v54, v49, vcc_lo ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 -; GFX10-NEXT: v_cndmask_b32_e32 v52, v82, v81, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v53, 16, v87 -; GFX10-NEXT: v_cndmask_b32_e32 v51, v65, v54, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v85 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v52, v96, v70, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 +; GFX10-NEXT: v_cndmask_b32_e32 v51, v55, v53, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v83 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v66 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v28 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v53 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v53, v87, v85, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v53, v83, v66, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 ; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v29 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v85 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v55, v53, v85, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66 +; GFX10-NEXT: v_cndmask_b32_e32 v55, v53, v66, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v54, v28, v12, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v87 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v55, v87, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v83 +; GFX10-NEXT: v_cndmask_b32_e32 v28, v55, v83, vcc_lo ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v64 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, 
v53 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v54 @@ -11322,13 +11322,12 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v26 -; GFX10-NEXT: v_perm_b32 v13, v14, v13, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v53, v12, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v11 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 ; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v9 -; GFX10-NEXT: v_perm_b32 v14, v31, v28, 0x5040100 -; GFX10-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v25 @@ -11354,7 +11353,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v53 -; GFX10-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v53, v10, vcc_lo ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54 ; GFX10-NEXT: v_cndmask_b32_e32 v27, v25, v9, vcc_lo @@ -11367,7 +11366,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v53, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25 -; GFX10-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 @@ -11389,7 +11388,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; 
GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v22 -; GFX10-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v26 ; GFX10-NEXT: v_cndmask_b32_e32 v26, v24, v8, vcc_lo @@ -11415,13 +11414,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6 -; GFX10-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc_lo ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22 -; GFX10-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 @@ -11433,7 +11432,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v5 -; GFX10-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v39, v6, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v21 @@ -11467,7 +11466,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; GFX10-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; 
GFX10-NEXT: v_perm_b32 v5, v48, v5, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 @@ -11483,10 +11482,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v17 -; GFX10-NEXT: v_perm_b32 v3, v39, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v15, v3, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX10-NEXT: v_perm_b32 v15, v49, v51, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v2 @@ -11530,10 +11530,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_perm_b32 v0, v52, v0, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 -; GFX10-NEXT: v_perm_b32 v2, v48, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v30, v2, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v4, v15, v4, 0x5040100 -; GFX10-NEXT: v_perm_b32 v15, v49, v51, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v14, v31, v28, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximumnum_v32bf16: @@ -12157,365 +12157,377 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-LABEL: v_maximumnum_v32bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v52, v52, v51, s1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v38, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v34, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v30 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v22 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v48, v48, v39, s0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v5 -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v80, v71, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v84, v83, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v96, v87, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v100, v99, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s2, v54, v54 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v86, v112, v103, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v114, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v26 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v116, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v118, v118 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v64, v64, v55, s2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v10 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v128, v119, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v100, v132, v131, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v96 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v147, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v33, v37, vcc_lo +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v48, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v39, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v35, 16, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v49, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v48, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v49 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v38, v53 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v51, v50 :: v_dual_lshlrev_b32 v53, 16, v52 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v48, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v52, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v100 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s3, v66, v66 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v68, v68, v67, s3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v68 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v81, 16, v80 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v26 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v66, v65 :: v_dual_and_b32 v66, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v55 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v67, v69, v68 :: v_dual_and_b32 v70, 0xffff0000, v25 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v83, v83, v80, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v54 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v64 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v67 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v68, v67, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v144, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v66 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v54, v64, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v68, v69 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v65, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 
vcc_lo, v70, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v82, v81, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v70 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v96, v87, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 16, v84 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v97, v99, v98 :: v_dual_and_b32 v96, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v99, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v81, v81, v70 :: v_dual_lshlrev_b32 v100, 16, v97 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v85, v87, v86 :: v_dual_lshlrev_b32 v96, 16, v81 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v85 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v87, v98, v97 :: v_dual_lshlrev_b32 v98, 16, v86 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v84, v96 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v84, v81, v70 :: v_dual_lshlrev_b32 v101, 16, 
v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v98, v99 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v96, v85, v86 :: v_dual_and_b32 v99, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v100, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v87, v97, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v99, v101, v100 :: v_dual_lshlrev_b32 v80, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v99, v99, v84, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v100, v100, v99, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v113, v115, v96, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v101, v115, v114, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v113, v114, v101, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v115, v115 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v114, v118, v117 :: v_dual_and_b32 v115, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v116, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v118, 16, v101 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v116, v100, v99 :: v_dual_lshlrev_b32 v119, 16, v113 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v115, v115 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v115, v117, v114, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v118, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v115 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v118, v113, v101, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v117, v129, v128, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v119, v130 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v119, v115, v114 :: v_dual_lshlrev_b32 v130, 16, v117 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v117, v131, v100, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 16, v86 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v119, v135, v102, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 
16, v52 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, v49, v130 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v14, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v70 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v117 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v37, v129 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v129, v51, v52, s0 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v55 -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v131 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v71 -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v132 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v68, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v69, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 16, v87 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65 -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v81, v134 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v83, v80, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v85, v135 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 16, v103 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v87, v82, vcc_lo -; GFX11-FAKE16-NEXT: 
v_cmp_gt_f32_e32 vcc_lo, v97, v144 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98 -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v101, v145 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97 -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v112, v146 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v113, v96, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v114, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v114, v115, v98, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v116, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v116, v117, v100, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v118, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v118, v119, v102, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v128, v49 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v128, v38, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v133, v135, v134 :: v_dual_lshlrev_b32 v102, 16, v84 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v135, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v144, v146, v145, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v128, v128, v117 :: v_dual_lshlrev_b32 v147, 16, v144 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v132, v134, v133 :: v_dual_lshlrev_b32 v135, 16, v128 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v146, v146 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v134, v145, v144 :: v_dual_lshlrev_b32 v145, 16, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v130, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 16, v134 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v130, v128, v117, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v145, v146 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v135, v132, v133, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v147, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v14 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v145, v134, v144 :: v_dual_lshlrev_b32 v148, 16, v30 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v34, v37, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v130, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v48, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v50 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v38, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v51 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v147, v14, v30, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v148, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v148, v30, v147 :: v_dual_lshlrev_b32 v103, 16, v96 ; 
GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v129, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v39, v52, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v129 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v53, v64, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v68 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v65, v68, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v70 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v69, v70, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v80 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v81, v80, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v82 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v85, v82, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v84 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v97, v84, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v86 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v101, v86, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v96 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v112, v96, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v98 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v100 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v102 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v118, v102, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v39 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v128, v34, vcc_lo -; 
GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v51 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v128 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v64, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v67 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v68, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v71 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v83 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v80, v83, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v52, v68, v55, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v67 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v53, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v55, v69, v67, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v70 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v30, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v65 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v64, v84, v70, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v86 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v67, v96, v86, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v97 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v81 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v70, v98, v97, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v99 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, 
v81, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v85 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v86, v116, v99, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v101 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v85, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v87 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v82, v87, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v99 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v84, v99, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v103 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v97, v118, v101, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v114 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v70, v87, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v99, v119, v114, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v117 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v86, v100, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v96, v113, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v101, v130, v117, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v133 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v97, v113, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v115 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v98, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v81 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v83, v100, v117, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v119 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v35, v119, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 
vcc_lo, 0, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v130, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v114, v135, v133, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v144 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v99, v115, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v117, v145, v144, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v49 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v101, v128, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v132 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v49, v14, v49, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v114, v132, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v135 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v87, v117, v134, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v34, v36, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v38, v37 :: v_dual_lshlrev_b32 v129, 16, v116 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v71 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v39, v49 :: v_dual_lshlrev_b32 v131, 16, v118 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v80 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v53, v54 :: v_dual_lshlrev_b32 v150, 16, v145 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v82 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v36, v68, v52 :: v_dual_and_b32 v53, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 
vcc_lo, 0, v83 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v129, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v131 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v53, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v65, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v133 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v69, v64, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v134 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v81, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v135 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v85, v68, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v144 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v97, v70, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v145 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v101, v71, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v15, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v69, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v102 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v84, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v103 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v96, v65 :: v_dual_lshlrev_b32 v50, 16, v130 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v112 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v98, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v129 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v116, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v15, v31, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 16, v116 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v54 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v112 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v146 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v112, v80, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v118, v70, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v33, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52 -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v114, v82, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v148 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v116, v83, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v50, v64 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v52, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v67 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v64 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v53, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v65 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v50, v118, v84, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v31, v54 :: v_dual_lshlrev_b32 v64, 16, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 16, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v53 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v149 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v119, v81, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v130, v85, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v64 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v52, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v66 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v53, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v50, v135, v86 :: v_dual_lshlrev_b32 v65, 16, v64 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v64, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v65, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v55, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v54, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v52 :: v_dual_lshlrev_b32 v54, 16, v55 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v67 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v13 -; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v64, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v128, v86, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v68 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v65, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v53, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v55, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v147 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v145, v87, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v148, v147 :: v_dual_lshlrev_b32 
v65, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v28 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v53, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v147 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v54, v147 :: v_dual_lshlrev_b32 v66, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v148 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v148, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v54 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v28 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v29, v13, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc_lo ; 
GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v54, v53 :: v_dual_lshlrev_b32 v64, 16, v55 ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v66, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v27 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v53, 0x5040100 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v28, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v28, v12 :: v_dual_lshlrev_b32 v65, 16, v27 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v29 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v11 @@ -12533,33 +12545,32 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v54, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v54, v12 :: v_dual_lshlrev_b32 v55, 16, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; 
GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v28 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v10 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v25 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v54, 16, v26 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v9 @@ -12567,64 +12578,63 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v27 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v25, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v26, 16, v8 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v53, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v27 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24 
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v24, v8 :: v_dual_lshlrev_b32 v29, 16, v7 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v22 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v24, v8, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v28 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v23, v7, vcc_lo +; GFX11-FAKE16-NEXT: 
v_dual_cndmask_b32 v27, v23, v7 :: v_dual_lshlrev_b32 v28, 16, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v26, v8 :: v_dual_lshlrev_b32 v25, 16, v22 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v27 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc_lo @@ -12632,48 +12642,49 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v5 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v8, v38, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v20 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v21 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v20 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v21, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v20, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v20, v4 :: v_dual_lshlrev_b32 v25, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4 @@ -12685,48 +12696,49 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v24, v3 :: v_dual_lshlrev_b32 v20, 16, v2 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v19 -; 
GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v0 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v31, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_lshlrev_b32 v20, 16, v16 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v1 ; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v23, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v16 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v24, v20 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v23 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v16, v0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 @@ -12734,6 +12746,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v18 ; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v2, v2, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v17 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo @@ -12742,20 +12755,19 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v52, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v2, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v4, v15, v4, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v51, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -13467,478 +13479,479 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, 
<32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v25 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v12 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v13 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v9 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v52, v52, v51, s1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v13 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v21 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v8 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v8 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v38, v38 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v9 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v34, v33, vcc_lo +; GFX12-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v30 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, 
v7 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v22 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v48, v48, v39, s0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v6 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v20 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v30 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v29 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v4 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v18 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v80, v71, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v2 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v80, v84, v83, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v26 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v17 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v1 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v82, v96, v87, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 -; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v25 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v11 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v84, v100, v99, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v16 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s2, v54, v54 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v112, v103, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v114, v114 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v22 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v26 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v96, v116, v115, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v118, v118 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v21 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v64, v64, v55, s2 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v10 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v24 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v8 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v9 ; GFX12-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v98, v128, v119, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v20 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v100, v132, v131, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v17 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v96 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v33, v37, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v22 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v19 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v8 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v20 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7 +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v34, v35 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v18 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v16 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v147, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v36, v37, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v17 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23 -; GFX12-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v48, v39, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v19 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v39, v50, vcc_lo +; GFX12-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v50 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v49, v48, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v100 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v48, v52, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v49 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v38, v53 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v10 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v34 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s3, v66, v66 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v68, v68, v67, s3 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v38, v51, v50 :: v_dual_lshlrev_b32 v53, 16, v52 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v68 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v80 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v48, v54, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v55 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v52, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v26 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v83, v83, v80, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v30 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 16, v84 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v64, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; 
GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v16 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v55, v66, v65 :: v_dual_and_b32 v66, 0xffff0000, v26 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v55 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v99, v99, v84, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v67, v69, v68 :: v_dual_and_b32 v70, 0xffff0000, v25 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v54 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v55, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v64 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v67 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v113, v115, v96, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v82 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v68, v67, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v69 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v55 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v65 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v144, 0xffff0000, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v66 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v54, v64, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v68, v69 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v117, v131, v100, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 16, v86 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v68, v65, v55, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v70, v71 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v119, v135, v102, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v69, v66, v67, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, v49, v130 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v82, v81, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v70 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v14, v30 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v70 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v117 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v96, v87, vcc_lo +; 
GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v37, v129 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v129, v51, v52, s0 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v97, v99, v98 :: v_dual_and_b32 v96, 0xffff0000, v23 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v99, 0xffff0000, v22 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v55 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v131 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v81, v81, v70 :: v_dual_lshlrev_b32 v100, 16, v97 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v71 -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v132 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v85, v87, v86 :: v_dual_lshlrev_b32 v96, 16, v81 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v68 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v18 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v85 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v68, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: 
v_cmp_gt_f32_e32 vcc_lo, v69, v133 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 16, v87 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v87, v98, v97 :: v_dual_lshlrev_b32 v98, 16, v86 +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v84, v96 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65 -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v81, v134 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v84, v81, v70 :: v_dual_lshlrev_b32 v101, 16, v87 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v98, v99 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v81, v83, v80, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v85, v135 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 16, v103 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v96, v85, v86 :: v_dual_and_b32 v99, 0xffff0000, v5 +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v100, v101 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v21 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v85, v87, v82, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v97, v144 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v98, v87, v97, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98 -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v101, v145 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v115 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v99, v101, v100 :: v_dual_lshlrev_b32 v80, 16, v53 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v21 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v99 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v101, v101 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v100, v100, v99, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v20 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97 -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v112, v146 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v112, v113, v96, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v114, v147 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v119 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v38 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v101, v115, v114, vcc_lo +; GFX12-FAKE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v114, v115, v98, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v116, v14 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v113, v114, v101, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v115, v115 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v116, v117, v100, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v118, v30 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v114, v118, v117 :: v_dual_and_b32 v115, 0xffff0000, v19 +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v116, v119 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v69 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v118, 16, v101 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v118, v119, v102, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v128, v49 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v116, v100, v99 :: v_dual_lshlrev_b32 v119, 16, v113 +; 
GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v115, v115 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v128, v38, v34, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v36 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v115, v117, v114, vcc_lo +; GFX12-FAKE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v118, v119 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v39 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v114 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v115 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v130, v36, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v118, v113, v101, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v48, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v117, v129, v128, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v119, v130 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v129, v52, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v64 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v129 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v119, v115, v114 :: v_dual_lshlrev_b32 v130, 16, v117 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v53, v64, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v68 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v133, v135, v134 :: v_dual_lshlrev_b32 v102, 16, v84 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v135, 0xffff0000, v17 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v68, v65, v68, vcc_lo -; 
GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v70 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v144, v146, v145, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v16 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v69, v70, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v80 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v128, v128, v117 :: v_dual_lshlrev_b32 v147, 16, v144 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v80, v81, v80, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v82 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v132, v134, v133 :: v_dual_lshlrev_b32 v135, 16, v128 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v82, v85, v82, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v84 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v134, v145, v144 :: v_dual_lshlrev_b32 v145, 16, v133 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v132 +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v130, v135 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 16, v134 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v84, v97, v84, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v86 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v130, v128, v117, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v145, v146 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v101, v86, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v96 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v135, v132, v133, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 
v147, v148 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v98 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v14 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v96, v112, v96, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v98 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v145, v134, v144 :: v_dual_lshlrev_b32 v148, 16, v30 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v37 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v100 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v34, v37, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v36 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v35 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v36, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v50 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v102 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v38, v50, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v51 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v118, v102, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v39 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v51, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v34 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v147, v14, v30, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v148, v148 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v128, v34, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v51 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v148, v30, v147 :: v_dual_lshlrev_b32 v103, 16, v96 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v51, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v128 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v39, v52, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v64 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v52, v68, v55, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v67 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v64, v55, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v67 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v30, v53, v64, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v55, v69, v67, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v70 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v30, v54, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v65 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v64, v84, v70, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v86 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v68, v67, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v71 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v65, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v67, v96, v86, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v97 ; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v83 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v66, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v81 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v70, v98, v97, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v99 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v80, v83, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v87 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v81, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v85 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v86, v116, v99, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v101 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v68, v82, v87, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v99 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v85, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v87 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v97, v118, v101, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v114 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v84, v99, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v103 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v70, v87, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v100 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v99, v119, v114, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v117 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v86, v100, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v113 -; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v48, 16, v37 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v101, v130, v117, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v133 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v80, v96, v113, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v97, v113, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v115 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v114, v135, v133, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v144 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v82, v98, v115, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v117 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v81 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v81, v99, v115, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v128 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v15 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v117, v145, v144, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v49 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v83, v100, v117, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v119 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v85, v101, v128, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v132 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v49, v14, v49, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v84, v35, v119, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v38 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v114, v132, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v134 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v135 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v34, v38, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v87, v117, 
v134, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v130, v14, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v34, v36, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v36, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v30, v38, v37 :: v_dual_lshlrev_b32 v129, 16, v116 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v71 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v129, v39, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v131 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v34, v39, v49 :: v_dual_lshlrev_b32 v131, 16, v118 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v80 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v53, v49, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v132 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v31 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v35, v53, v54 :: v_dual_lshlrev_b32 v150, 16, v145 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v82 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v65, v55, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v133 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v36, v68, v52 :: v_dual_and_b32 v53, 0xffff0000, v31 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v83 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v69, v64, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v134 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v69, v55, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 
vcc_lo, 0, v102 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v81, v67, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v135 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v84, v64, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v103 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v85, v68, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v144 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v39, v96, v65 :: v_dual_lshlrev_b32 v50, 16, v130 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v112 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v97, v70, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v145 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v98, v66, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v129 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v101, v71, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v116, v67, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v15, v31, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v15, v31, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 16, v116 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v55 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v54 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v52, vcc_lo -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v112 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v33 -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v146 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v131 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v33 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v112, v80, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v118, v70, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v33, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v118 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52 -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v147 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v53 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v31, v54 :: v_dual_lshlrev_b32 v64, 16, v52 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 16, v119 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v53 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v149 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v31, v114, v82, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v148 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v31, v119, v81, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v32, v116, v83, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v50, v64 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v32, v130, v85, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v64 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v52, 
v33, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v67 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v64 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v52, v33, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v66 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v53, v55, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v102 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v65 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v53, v54, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v118, v84, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v50, v135, v86 :: v_dual_lshlrev_b32 v65, 16, v64 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v33 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v64, v33, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v55, v33, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v65, v55, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v54, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v52, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v52 :: v_dual_lshlrev_b32 v54, 16, v55 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v53 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v53, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v67 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v53, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v148 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v13 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v64, v33, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v55, v33, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v150 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v147 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v128, v86, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v68 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v145, v87, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v65, v53, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v66 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v29 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v12 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v51, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v12 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v53 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v54, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v13 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v148, v147 :: 
v_dual_lshlrev_b32 v65, 16, v13 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v28 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v28 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v147 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v53, v54, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v55, v54, v147 :: v_dual_lshlrev_b32 v66, 16, v12 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v148 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v66, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v148, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v64 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v53 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v54 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v28 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v29, v13, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v54, v53 :: v_dual_lshlrev_b32 v64, 16, v55 ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v66, v65 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v27 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_perm_b32 v14, v14, v53, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v28, v12, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v28, v12 :: v_dual_lshlrev_b32 v65, 16, v27 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v29 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo @@ -13960,31 +13973,33 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v26 -; GFX12-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v27 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v54, v12, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v54, v12 :: v_dual_lshlrev_b32 v55, 16, v26 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v28 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v10 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v25 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v54, 16, v26 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo @@ -13992,9 +14007,6 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v10 -; GFX12-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54 ; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v9 @@ -14005,8 +14017,6 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v10, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54 @@ -14017,57 +14027,57 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v26, 16, v8 ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v10, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25 +; GFX12-FAKE16-NEXT: v_perm_b32 v14, v14, v53, 0x5040100 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v7 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v24 +; GFX12-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 ; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v27 -; GFX12-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v24, v24, v8 :: v_dual_lshlrev_b32 v29, 16, v7 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24 ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v8 -; GFX12-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v22 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v26 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v24, v8, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v28 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v23, v7, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v27, v23, v7 :: v_dual_lshlrev_b32 v28, 16, v6 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v26, v8 :: v_dual_lshlrev_b32 v25, 16, v22 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo @@ -14075,11 +14085,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GFX12-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v22 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v27 ; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffd @@ -14089,23 +14098,22 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX12-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v20 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd @@ -14113,14 +14121,17 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 ; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v21 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; GFX12-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 @@ -14129,10 +14140,8 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v24 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v20 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; GFX12-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v21, v5, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc_lo @@ -14140,10 +14149,9 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v24 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v19 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v20, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v22, v20, v4 :: v_dual_lshlrev_b32 v25, 16, v3 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 
vcc_lo, 0, v21 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo @@ -14153,56 +14161,59 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v24 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v19, v3, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v24, v3 :: v_dual_lshlrev_b32 v20, 16, v2 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v19 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX12-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 
v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_perm_b32 v3, v31, v3, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_lshlrev_b32 v20, 16, v16 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; 
GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v23, v19 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v16 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v24, v20 @@ -14210,9 +14221,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v23 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v16, v0, vcc_lo @@ -14223,6 +14232,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v17 @@ -14234,23 +14244,23 @@ 
define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19 ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_perm_b32 v0, v52, v0, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_perm_b32 v2, v32, v2, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_perm_b32 v4, v15, v4, 0x5040100 ; GFX12-FAKE16-NEXT: v_perm_b32 v15, v33, v51, 0x5040100 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 4f73e8e9c1883..40641f052b178 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -4817,77 +4817,77 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 
; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v17 +; GFX7-SDAG-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v21 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v20 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v22 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v23 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v21 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v18 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v25 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v16 -; GFX7-SDAG-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v25 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v16 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v24 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v26 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v20 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v21, v23 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v20 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v16 ; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v9, v18 ; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v19 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v28 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v29 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v30 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; 
GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v21 ; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v20 -; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v16 ; GFX7-SDAG-NEXT: v_max_f32_e32 v13, v13, v18 ; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v14, v19 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v15, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v15, v17 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v16f16: @@ -4900,46 +4900,46 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v17 +; GFX7-GISEL-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v20 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v18 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v21 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v20 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v17 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v19 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v22 -; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v18 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v21 +; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v23 ; GFX7-GISEL-NEXT: 
v_max_f32_e32 v3, v3, v16 -; GFX7-GISEL-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v19 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v23 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v22 +; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v19 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v24 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v24 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v25 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v25 +; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v26 -; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v7, v17 -; GFX7-GISEL-NEXT: v_max_f32_e32 v8, v8, v18 -; GFX7-GISEL-NEXT: v_max_f32_e32 v9, v9, v19 -; GFX7-GISEL-NEXT: v_max_f32_e32 v10, v10, v20 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v26 +; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v7, v18 +; GFX7-GISEL-NEXT: v_max_f32_e32 v8, v8, v19 +; GFX7-GISEL-NEXT: v_max_f32_e32 v9, v9, v20 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v27 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v27 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v28 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v28 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v29 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v29 +; GFX7-GISEL-NEXT: v_max_f32_e32 v10, v10, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v30 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v30 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-GISEL-NEXT: v_max_f32_e32 v11, v11, v17 -; GFX7-GISEL-NEXT: v_max_f32_e32 v12, v12, v18 -; GFX7-GISEL-NEXT: v_max_f32_e32 v13, v13, v19 -; GFX7-GISEL-NEXT: v_max_f32_e32 v14, v14, v20 +; GFX7-GISEL-NEXT: v_max_f32_e32 v11, v11, v18 +; GFX7-GISEL-NEXT: v_max_f32_e32 v12, v12, v19 
+; GFX7-GISEL-NEXT: v_max_f32_e32 v13, v13, v20 +; GFX7-GISEL-NEXT: v_max_f32_e32 v14, v14, v16 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -4956,8 +4956,8 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v13, v13 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-GISEL-NEXT: v_max_f32_e32 v15, v15, v16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-GISEL-NEXT: v_max_f32_e32 v15, v15, v17 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v15, v15 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4966,27 +4966,26 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v25, v3, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v25, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15 ; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7 ; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14 @@ -5003,7 +5002,8 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8 ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v23, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: 
v_max_f16_sdwa v19, v19, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v15 ; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v14 ; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v13 @@ -5012,13 +5012,13 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v10 ; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v9 ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v8 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v23 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v22 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v21 -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20 -; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v19 -; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v18 -; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v17 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v19 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v24 +; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v22 +; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v20 +; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v18 ; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5965,97 +5965,102 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v31, v0, v0 ; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v16, v16 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v31, v31, v32 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v16, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v31, v31, v32 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v16, v1, v1 ; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v17, v17 +; 
GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_max_f16_e32 v16, v33, v32 +; GFX8-GISEL-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v17, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v16, v16, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v34, v2, v2 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v2, v2 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v18, v18 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v18, v18 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v18, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v17, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v3, v3 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v3, v3 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v19, v19 +; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v19, v19 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v19, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v18, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v33, v18 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v4, v4 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v4, v4 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v20, v20 +; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v20, 
v20 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v20, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v19, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v33, v19 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v5, v5 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v20, v5, v5 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v21, v21 +; GFX8-GISEL-NEXT: v_max_f16_e32 v20, v21, v21 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v21, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v20, v20, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v20, v33, v20 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v6, v6 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v21, v6, v6 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v22, v22 +; GFX8-GISEL-NEXT: v_max_f16_e32 v21, v22, v22 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v22, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v21, v21, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v21, v33, v21 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v7, v7 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v22, v7, v7 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v23, v23 +; GFX8-GISEL-NEXT: v_max_f16_e32 v22, v23, v23 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; 
GFX8-GISEL-NEXT: v_max_f16_sdwa v23, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v22, v22, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v22, v33, v22 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v8, v8 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v23, v8, v8 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v24, v24 +; GFX8-GISEL-NEXT: v_max_f16_e32 v23, v24, v24 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v8, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v24, v24, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v23, v23, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v23, v33, v23 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v9, v9 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v8, v8, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v24, v9, v9 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v25, v25 +; GFX8-GISEL-NEXT: v_max_f16_e32 v24, v25, v25 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v9, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v25, v25, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v24, v24, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v24, v33, v24 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v10, v10 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v9, v9, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v25, v10, v10 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v26, v26 +; GFX8-GISEL-NEXT: v_max_f16_e32 v25, v26, v26 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v10, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v26, v26, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 
-; GFX8-GISEL-NEXT: v_max_f16_e32 v25, v25, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v25, v33, v25 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v11, v11 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v10, v10, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v26, v11, v11 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v27, v27 +; GFX8-GISEL-NEXT: v_max_f16_e32 v26, v27, v27 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v11, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v27, v27, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v26, v26, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v26, v33, v26 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v12, v12 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v11, v11, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v27, v12, v12 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v28, v28 +; GFX8-GISEL-NEXT: v_max_f16_e32 v27, v28, v28 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v12, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v28, v28, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v27, v27, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v27, v33, v27 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v13, v13 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v12, v12, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v28, v13, v13 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v29, v29 +; GFX8-GISEL-NEXT: v_max_f16_e32 v28, v29, v29 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v13, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v29, v29, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v28, v28, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v28, v33, 
v28 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v14, v14 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v13, v13, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v29, v14, v14 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v30, v30 +; GFX8-GISEL-NEXT: v_max_f16_e32 v29, v30, v30 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v30, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v29, v33, v29 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v15, v15 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v14, v14, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: buffer_load_dword v30, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_max_f16_e32 v29, v29, v32 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v15, v15 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8-GISEL-NEXT: v_max_f16_e32 v30, v32, v32 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v32, v32, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v34, v17 +; GFX8-GISEL-NEXT: v_max_f16_e32 v30, v33, v30 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v15, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v31, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v16, v1 ; GFX8-GISEL-NEXT: v_or_b32_e32 v2, v17, v2 @@ -6071,12 +6076,7 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v12, v27, v12 ; GFX8-GISEL-NEXT: v_or_b32_e32 v13, v28, v13 ; GFX8-GISEL-NEXT: v_or_b32_e32 v14, v29, v14 -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v30, v30 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v30, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v32, v33 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v15, v15, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_or_b32_e32 v15, v32, v15 +; GFX8-GISEL-NEXT: v_or_b32_e32 v15, v30, v15 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: v_maximumnum_v32f16: @@ -6086,6 +6086,7 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v16 ; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v17, v17 +; GFX900-SDAG-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v16 ; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v18, v18 @@ -6093,44 +6094,43 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_max_f16 v2, v2, v16 ; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v19, v19 ; GFX900-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX900-SDAG-NEXT: v_pk_max_f16 v3, v3, v16 -; GFX900-SDAG-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX900-SDAG-NEXT: v_pk_max_f16 v17, v20, v20 +; GFX900-SDAG-NEXT: v_pk_max_f16 v18, v20, v20 ; GFX900-SDAG-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX900-SDAG-NEXT: v_pk_max_f16 v18, v21, v21 +; GFX900-SDAG-NEXT: v_pk_max_f16 v19, v21, v21 ; GFX900-SDAG-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX900-SDAG-NEXT: v_pk_max_f16 v19, v22, v22 +; GFX900-SDAG-NEXT: v_pk_max_f16 v20, v22, v22 ; GFX900-SDAG-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX900-SDAG-NEXT: v_pk_max_f16 v20, v23, v23 +; GFX900-SDAG-NEXT: v_pk_max_f16 v21, v23, v23 ; GFX900-SDAG-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX900-SDAG-NEXT: v_pk_max_f16 v21, v24, v24 +; GFX900-SDAG-NEXT: v_pk_max_f16 v22, v24, v24 ; GFX900-SDAG-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX900-SDAG-NEXT: v_pk_max_f16 v22, v25, v25 +; GFX900-SDAG-NEXT: v_pk_max_f16 v23, v25, v25 ; GFX900-SDAG-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX900-SDAG-NEXT: 
v_pk_max_f16 v23, v26, v26 +; GFX900-SDAG-NEXT: v_pk_max_f16 v24, v26, v26 ; GFX900-SDAG-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX900-SDAG-NEXT: v_pk_max_f16 v24, v27, v27 +; GFX900-SDAG-NEXT: v_pk_max_f16 v25, v27, v27 ; GFX900-SDAG-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX900-SDAG-NEXT: v_pk_max_f16 v25, v28, v28 +; GFX900-SDAG-NEXT: v_pk_max_f16 v26, v28, v28 ; GFX900-SDAG-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX900-SDAG-NEXT: v_pk_max_f16 v26, v29, v29 +; GFX900-SDAG-NEXT: v_pk_max_f16 v27, v29, v29 ; GFX900-SDAG-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX900-SDAG-NEXT: v_pk_max_f16 v27, v30, v30 +; GFX900-SDAG-NEXT: v_pk_max_f16 v28, v30, v30 ; GFX900-SDAG-NEXT: v_pk_max_f16 v14, v14, v14 ; GFX900-SDAG-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX900-SDAG-NEXT: v_pk_max_f16 v4, v4, v17 -; GFX900-SDAG-NEXT: v_pk_max_f16 v5, v5, v18 -; GFX900-SDAG-NEXT: v_pk_max_f16 v6, v6, v19 -; GFX900-SDAG-NEXT: v_pk_max_f16 v7, v7, v20 -; GFX900-SDAG-NEXT: v_pk_max_f16 v8, v8, v21 -; GFX900-SDAG-NEXT: v_pk_max_f16 v9, v9, v22 -; GFX900-SDAG-NEXT: v_pk_max_f16 v10, v10, v23 -; GFX900-SDAG-NEXT: v_pk_max_f16 v11, v11, v24 -; GFX900-SDAG-NEXT: v_pk_max_f16 v12, v12, v25 -; GFX900-SDAG-NEXT: v_pk_max_f16 v13, v13, v26 -; GFX900-SDAG-NEXT: v_pk_max_f16 v14, v14, v27 +; GFX900-SDAG-NEXT: v_pk_max_f16 v3, v3, v16 +; GFX900-SDAG-NEXT: v_pk_max_f16 v4, v4, v18 +; GFX900-SDAG-NEXT: v_pk_max_f16 v5, v5, v19 +; GFX900-SDAG-NEXT: v_pk_max_f16 v6, v6, v20 +; GFX900-SDAG-NEXT: v_pk_max_f16 v7, v7, v21 +; GFX900-SDAG-NEXT: v_pk_max_f16 v8, v8, v22 +; GFX900-SDAG-NEXT: v_pk_max_f16 v9, v9, v23 +; GFX900-SDAG-NEXT: v_pk_max_f16 v10, v10, v24 +; GFX900-SDAG-NEXT: v_pk_max_f16 v11, v11, v25 +; GFX900-SDAG-NEXT: v_pk_max_f16 v12, v12, v26 +; GFX900-SDAG-NEXT: v_pk_max_f16 v13, v13, v27 +; GFX900-SDAG-NEXT: v_pk_max_f16 v14, v14, v28 ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v16, v16 +; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v17, v17 ; GFX900-SDAG-NEXT: v_pk_max_f16 v15, v15, 
v16 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6140,52 +6140,52 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 ; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v16 -; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v17, v17 +; GFX900-GISEL-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v16 ; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v18, v18 ; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v16 ; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v19, v19 -; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v16 -; GFX900-GISEL-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX900-GISEL-NEXT: v_pk_max_f16 v17, v20, v20 +; GFX900-GISEL-NEXT: v_pk_max_f16 v18, v20, v20 ; GFX900-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX900-GISEL-NEXT: v_pk_max_f16 v18, v21, v21 +; GFX900-GISEL-NEXT: v_pk_max_f16 v19, v21, v21 ; GFX900-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX900-GISEL-NEXT: v_pk_max_f16 v19, v22, v22 +; GFX900-GISEL-NEXT: v_pk_max_f16 v20, v22, v22 ; GFX900-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX900-GISEL-NEXT: v_pk_max_f16 v20, v23, v23 +; GFX900-GISEL-NEXT: v_pk_max_f16 v21, v23, v23 ; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX900-GISEL-NEXT: v_pk_max_f16 v21, v24, v24 +; GFX900-GISEL-NEXT: v_pk_max_f16 v22, v24, v24 ; GFX900-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX900-GISEL-NEXT: v_pk_max_f16 v22, v25, v25 +; GFX900-GISEL-NEXT: v_pk_max_f16 v23, v25, v25 ; GFX900-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX900-GISEL-NEXT: v_pk_max_f16 v23, v26, v26 +; GFX900-GISEL-NEXT: v_pk_max_f16 v24, v26, v26 ; GFX900-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX900-GISEL-NEXT: v_pk_max_f16 v24, v27, v27 +; GFX900-GISEL-NEXT: 
v_pk_max_f16 v25, v27, v27 ; GFX900-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX900-GISEL-NEXT: v_pk_max_f16 v25, v28, v28 +; GFX900-GISEL-NEXT: v_pk_max_f16 v26, v28, v28 ; GFX900-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX900-GISEL-NEXT: v_pk_max_f16 v26, v29, v29 +; GFX900-GISEL-NEXT: v_pk_max_f16 v27, v29, v29 ; GFX900-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX900-GISEL-NEXT: v_pk_max_f16 v27, v30, v30 +; GFX900-GISEL-NEXT: v_pk_max_f16 v28, v30, v30 ; GFX900-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v17 -; GFX900-GISEL-NEXT: v_pk_max_f16 v5, v5, v18 -; GFX900-GISEL-NEXT: v_pk_max_f16 v6, v6, v19 -; GFX900-GISEL-NEXT: v_pk_max_f16 v7, v7, v20 -; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v8, v21 -; GFX900-GISEL-NEXT: v_pk_max_f16 v9, v9, v22 -; GFX900-GISEL-NEXT: v_pk_max_f16 v10, v10, v23 -; GFX900-GISEL-NEXT: v_pk_max_f16 v11, v11, v24 -; GFX900-GISEL-NEXT: v_pk_max_f16 v12, v12, v25 -; GFX900-GISEL-NEXT: v_pk_max_f16 v13, v13, v26 -; GFX900-GISEL-NEXT: v_pk_max_f16 v14, v14, v27 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v16 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v18 +; GFX900-GISEL-NEXT: v_pk_max_f16 v5, v5, v19 +; GFX900-GISEL-NEXT: v_pk_max_f16 v6, v6, v20 +; GFX900-GISEL-NEXT: v_pk_max_f16 v7, v7, v21 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v8, v22 +; GFX900-GISEL-NEXT: v_pk_max_f16 v9, v9, v23 +; GFX900-GISEL-NEXT: v_pk_max_f16 v10, v10, v24 +; GFX900-GISEL-NEXT: v_pk_max_f16 v11, v11, v25 +; GFX900-GISEL-NEXT: v_pk_max_f16 v12, v12, v26 +; GFX900-GISEL-NEXT: v_pk_max_f16 v13, v13, v27 +; GFX900-GISEL-NEXT: v_pk_max_f16 v14, v14, v28 ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 +; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v17, v17 ; GFX900-GISEL-NEXT: v_pk_max_f16 v15, v15, v16 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 4c0ab91b7d622..ad73a905e4fb3 100644 
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -82,14 +82,9 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 @@ -100,341 +95,353 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] ; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 ; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: 
flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:48 +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[24:25] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[31:34], v[24:25] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[39:42], v[24:25] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[24:25] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[64:67], v[24:25] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[24:25] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[80:83], v[24:25] offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:48 ; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:254 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:250 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:246 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:242 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:240 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], 
s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:238 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:234 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:230 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v36 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:226 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:224 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:16 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:222 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:218 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:216 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:214 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:210 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:208 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 
-; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:206 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:202 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:198 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:194 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:192 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 -; 
ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:190 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:188 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:186 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:184 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:182 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:180 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:178 +; ALIGNED-NEXT: 
flat_store_byte v[16:17], v65 offset:176 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:174 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:170 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:166 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:162 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:160 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword 
v19, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v40, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v41, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:158 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:154 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:152 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:150 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:148 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:146 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:144 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:104 +; 
ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 24, v69 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:142 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:138 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:134 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:132 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:130 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:128 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:126 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:122 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:118 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:114 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:112 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 +; 
ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:110 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:106 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:102 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:96 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 
offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:94 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:90 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:80 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:78 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:74 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:64 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v52, off, 
s[0:3], s32 offset:240 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:62 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:58 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:32 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 ; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 @@ -447,258 +454,243 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 ; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 +; 
ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 ; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 
offset:249 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v82 ; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:241 ; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:237 ; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:229 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v30 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v25 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:221 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 
offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:215 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v118 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v21 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v119 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v40 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:181 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v67, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v41 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v42 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:149 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:147 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:145 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:143 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:141 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:139 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:137 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:135 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:133 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:131 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:129 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:127 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:125 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:123 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:121 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:119 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:117 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:115 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:113 +; ALIGNED-NEXT: 
flat_store_byte v[16:17], v71 offset:111 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:109 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:107 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:105 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:103 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:95 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:93 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:91 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:89 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:85 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:79 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:77 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:75 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:73 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:63 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:61 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:59 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:57 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:43 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:41 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:47 +; 
ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:45 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:31 ; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:27 ; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:23 ; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:19 ; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 ; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 ; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 ; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 ; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 @@ -714,15 +706,10 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[16:17], 
v4 offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -838,14 +825,9 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB1_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 @@ -856,339 +838,351 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[24:25], off ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[24:25], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[24:25], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[24:25], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[24:25], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[35:38], v[24:25], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[31:34], v[24:25], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[24:25], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[39:42], v[24:25], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[24:25], off offset:160 +; ALIGNED-NEXT: 
global_load_dwordx4 v[64:67], v[24:25], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[24:25], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[24:25], off offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:48 ; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:254 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:252 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:254 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:250 -; ALIGNED-NEXT: global_store_byte v[16:17], 
v30, off offset:248 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:250 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:246 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:244 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:246 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:242 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:240 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:242 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:238 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:236 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:238 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:234 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:232 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:234 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:230 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:228 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:230 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:226 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:224 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:226 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, 
off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:16 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:222 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:220 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:222 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:218 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:216 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:218 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:216 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:214 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:212 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:214 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:210 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: 
buffer_store_dword v98, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:206 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:202 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:198 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:194 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:210 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v96, 
off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:190 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:186 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:182 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:178 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:206 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:202 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:198 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:194 +; ALIGNED-NEXT: global_store_byte 
v[16:17], v69, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:174 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:170 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:166 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:162 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], 
s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:158 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:156 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:190 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:188 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:154 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:152 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:186 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:184 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:150 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:148 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:182 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:180 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:146 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:178 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: 
buffer_load_dword v84, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:174 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:170 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:166 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:162 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v40, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v41, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:142 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:140 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:158 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:138 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:136 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:154 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:152 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:134 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:132 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:150 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:148 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:130 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:146 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword 
v54, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 24, v69 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:142 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:122 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:120 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:138 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:118 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:116 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:134 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:132 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:114 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: 
buffer_store_dword v28, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:130 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:110 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:106 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:102 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:98 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:94 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:92 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:126 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:90 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:88 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:122 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:86 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:84 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:118 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:82 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:114 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:110 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:106 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:102 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:98 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword 
v35, off, s[0:3], s32 offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:78 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:76 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:94 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:74 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:72 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:90 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:70 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:68 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:86 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:66 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:82 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:62 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:60 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:78 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:58 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:56 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:74 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:54 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:52 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off 
offset:70 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:50 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:66 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; 
ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:62 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:58 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:54 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:50 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 
offset:260 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:42 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:46 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:34 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:38 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 @@ -1201,258 +1195,243 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 ; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: 
s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v86 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v80 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 ; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v83 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:249 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v82 ; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v71 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:241 ; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v70 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:237 ; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v67 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v66 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:229 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v30 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v25 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:221 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:203 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:215 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v23 +; ALIGNED-NEXT: global_store_byte v[16:17], v118, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v22 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v21 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v29 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; ALIGNED-NEXT: global_store_byte 
v[16:17], v113, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v34 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v33 +; ALIGNED-NEXT: global_store_byte v[16:17], v119, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v38 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v37 +; ALIGNED-NEXT: global_store_byte v[16:17], v40, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v36 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v35 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v50 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v49 +; ALIGNED-NEXT: global_store_byte v[16:17], v41, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v48 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v39 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v55 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v54 +; ALIGNED-NEXT: global_store_byte v[16:17], v42, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v53 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v52 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v14 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:165 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v51, 24, v15 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v12 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v11 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v10 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v9 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v36, 8, v36 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 
24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:147 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:145 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:143 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:141 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:139 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:137 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:133 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:131 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:129 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:127 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:123 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:121 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:117 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:115 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:113 -; ALIGNED-NEXT: global_store_byte 
v[16:17], v51, off offset:111 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:109 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:107 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:105 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:101 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:99 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:97 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:95 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:93 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:91 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:89 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:85 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:83 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:81 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:79 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:77 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:75 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:73 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:71 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:69 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:67 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:65 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:63 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:61 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:59 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:57 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51 -; ALIGNED-NEXT: global_store_byte 
v[16:17], v18, off offset:49 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:149 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:147 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:145 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:143 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:141 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:139 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:137 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:135 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:133 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:131 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:129 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:127 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:125 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:123 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:121 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:119 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:117 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:115 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:113 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:111 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:109 +; ALIGNED-NEXT: 
global_store_byte v[16:17], v101, off offset:107 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:105 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:103 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:101 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:99 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:97 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:95 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:93 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:91 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:89 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:87 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:85 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:83 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:81 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:79 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:77 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:75 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:73 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:71 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:69 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:67 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:65 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:63 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:61 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:59 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:57 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:55 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:53 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:51 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:49 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:43 +; ALIGNED-NEXT: 
global_store_byte v[16:17], v14, off offset:41 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:47 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:45 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:35 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:33 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:39 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:37 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:31 ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:27 ; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:23 ; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:19 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:19 ; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 ; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10 ; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6 ; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2 ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, 
v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 @@ -1468,15 +1447,10 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -1592,15 +1566,28 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 ; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 @@ -1618,473 +1605,487 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 -; 
ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v98 offset:250 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v99 offset:254 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:252 +; ALIGNED-NEXT: flat_store_byte v[84:85], v98 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v97 offset:246 +; ALIGNED-NEXT: flat_store_byte v[84:85], v97 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v96 offset:242 
+; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v103 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v101 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v101 +; ALIGNED-NEXT: flat_store_byte v[84:85], v112 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v100 +; ALIGNED-NEXT: flat_store_byte v[84:85], v113 offset:253 +; ALIGNED-NEXT: flat_store_byte v[84:85], v114 offset:247 +; ALIGNED-NEXT: flat_store_byte v[84:85], v115 offset:245 +; ALIGNED-NEXT: flat_store_byte v[84:85], v116 offset:243 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:241 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword 
v100, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v102 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v103 offset:238 +; ALIGNED-NEXT: flat_store_byte v[84:85], v103 offset:236 +; ALIGNED-NEXT: flat_store_byte v[84:85], v102 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v101 offset:230 +; ALIGNED-NEXT: flat_store_byte v[84:85], v101 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v100 offset:226 +; ALIGNED-NEXT: flat_store_byte v[84:85], v100 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 
offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[84:85], v97 offset:235 +; ALIGNED-NEXT: flat_store_byte v[84:85], v98 offset:233 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:239 +; ALIGNED-NEXT: flat_store_byte v[84:85], v117 offset:237 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:231 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:229 +; ALIGNED-NEXT: flat_store_byte v[84:85], v112 offset:227 +; ALIGNED-NEXT: flat_store_byte v[84:85], v100 offset:225 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[84:85], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[84:85], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v81 offset:214 +; ALIGNED-NEXT: flat_store_byte v[84:85], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[84:85], v80 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v43, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v44, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v45, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v46, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[84:85], v113 offset:219 +; ALIGNED-NEXT: flat_store_byte v[84:85], v114 offset:217 +; ALIGNED-NEXT: flat_store_byte v[84:85], v115 offset:223 +; ALIGNED-NEXT: flat_store_byte v[84:85], v116 offset:221 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:215 +; ALIGNED-NEXT: flat_store_byte v[84:85], v101 offset:213 +; ALIGNED-NEXT: flat_store_byte v[84:85], v102 offset:211 +; ALIGNED-NEXT: flat_store_byte v[84:85], v80 offset:209 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:216 +; 
ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[84:85], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[84:85], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[84:85], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[84:85], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v47, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v56, 8, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v57, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v58, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v59, 24, v64 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[84:85], v103 offset:203 +; ALIGNED-NEXT: flat_store_byte v[84:85], v42 offset:201 +; ALIGNED-NEXT: flat_store_byte v[84:85], v43 offset:207 +; ALIGNED-NEXT: flat_store_byte v[84:85], v44 offset:205 +; ALIGNED-NEXT: flat_store_byte v[84:85], v45 offset:199 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:197 +; ALIGNED-NEXT: flat_store_byte v[84:85], v46 offset:195 +; ALIGNED-NEXT: flat_store_byte v[84:85], v68 offset:193 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:300 +; 
ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[84:85], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[84:85], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[84:85], v64 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v40, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v41, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v60, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[84:85], v47 offset:187 +; ALIGNED-NEXT: flat_store_byte v[84:85], v81 offset:185 +; ALIGNED-NEXT: flat_store_byte v[84:85], v82 offset:191 +; ALIGNED-NEXT: flat_store_byte v[84:85], v56 offset:189 +; ALIGNED-NEXT: flat_store_byte v[84:85], v57 offset:183 +; ALIGNED-NEXT: flat_store_byte v[84:85], v58 offset:181 +; ALIGNED-NEXT: flat_store_byte v[84:85], v59 offset:179 +; ALIGNED-NEXT: flat_store_byte v[84:85], v64 offset:177 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte v[84:85], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[84:85], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v52 
offset:162 +; ALIGNED-NEXT: flat_store_byte v[84:85], v52 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[84:85], v53 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[84:85], v118 offset:175 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v50 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v98, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v48 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[84:85], v97 offset:171 +; ALIGNED-NEXT: flat_store_byte v[84:85], v40 offset:169 +; ALIGNED-NEXT: flat_store_byte v[84:85], v119 offset:173 +; ALIGNED-NEXT: flat_store_byte v[84:85], v41 offset:163 +; ALIGNED-NEXT: flat_store_byte v[84:85], v60 offset:161 +; ALIGNED-NEXT: flat_store_byte v[84:85], v69 offset:167 +; ALIGNED-NEXT: flat_store_byte v[84:85], v42 offset:165 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 
offset:256 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[84:85], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[84:85], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[84:85], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[84:85], v48 offset:144 +; ALIGNED-NEXT: flat_store_byte v[84:85], v118 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 8, v36 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v69 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:155 +; ALIGNED-NEXT: flat_store_byte v[84:85], v117 offset:153 +; ALIGNED-NEXT: flat_store_byte v[84:85], v98 offset:159 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:157 +; ALIGNED-NEXT: flat_store_byte v[84:85], v102 offset:151 +; ALIGNED-NEXT: flat_store_byte v[84:85], v115 offset:149 +; ALIGNED-NEXT: flat_store_byte v[84:85], v116 offset:147 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[84:85], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[84:85], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[84:85], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[84:85], v83 offset:139 +; ALIGNED-NEXT: flat_store_byte v[84:85], v100 offset:137 +; ALIGNED-NEXT: flat_store_byte v[84:85], v114 offset:143 +; ALIGNED-NEXT: flat_store_byte v[84:85], v101 offset:141 +; ALIGNED-NEXT: flat_store_byte v[84:85], v103 offset:135 +; ALIGNED-NEXT: flat_store_byte v[84:85], v112 offset:133 +; ALIGNED-NEXT: flat_store_byte v[84:85], v113 offset:131 +; ALIGNED-NEXT: flat_store_byte 
v[84:85], v118 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[84:85], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[84:85], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[84:85], v32 offset:112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v30 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v28 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v115 offset:113 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[84:85], v70 offset:123 +; ALIGNED-NEXT: flat_store_byte v[84:85], v82 offset:121 +; ALIGNED-NEXT: flat_store_byte v[84:85], v71 offset:127 +; ALIGNED-NEXT: flat_store_byte v[84:85], v80 offset:125 +; ALIGNED-NEXT: flat_store_byte v[84:85], v81 offset:119 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:117 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:115 +; ALIGNED-NEXT: flat_store_byte v[84:85], v32 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v28, 
off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[84:85], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[84:85], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[84:85], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[84:85], v28 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v14 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v52 offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v54 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v6 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 
offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: 
buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[84:85], v53 offset:111 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 
v82, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v20 +; ALIGNED-NEXT: flat_store_byte v[84:85], v55 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[84:85], v64 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[84:85], v67 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[84:85], v68 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[84:85], v28 offset:97 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:91 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[84:85], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[84:85], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[84:85], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[84:85], v24 offset:80 +; ALIGNED-NEXT: flat_store_byte v[84:85], v66 offset:89 +; ALIGNED-NEXT: flat_store_byte v[84:85], v69 offset:95 +; ALIGNED-NEXT: flat_store_byte v[84:85], v49 offset:93 +; ALIGNED-NEXT: flat_store_byte v[84:85], v50 offset:87 +; ALIGNED-NEXT: flat_store_byte v[84:85], v51 offset:85 +; ALIGNED-NEXT: flat_store_byte v[84:85], v97 offset:83 +; ALIGNED-NEXT: flat_store_byte 
v[84:85], v35 offset:81 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[84:85], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[84:85], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[84:85], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[84:85], v48 offset:75 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:73 +; ALIGNED-NEXT: flat_store_byte v[84:85], v98 offset:79 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:77 +; ALIGNED-NEXT: flat_store_byte v[84:85], v102 offset:71 +; ALIGNED-NEXT: flat_store_byte v[84:85], v115 offset:69 +; ALIGNED-NEXT: flat_store_byte v[84:85], v116 offset:67 +; ALIGNED-NEXT: flat_store_byte v[84:85], v32 offset:65 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[84:85], v37 offset:59 +; ALIGNED-NEXT: flat_store_byte v[84:85], v38 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[84:85], v39 offset:63 +; ALIGNED-NEXT: flat_store_byte v[84:85], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:61 +; ALIGNED-NEXT: flat_store_byte v[84:85], v18 offset:56 +; 
ALIGNED-NEXT: flat_store_byte v[84:85], v83 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[84:85], v100 offset:53 +; ALIGNED-NEXT: flat_store_byte v[84:85], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[84:85], v114 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[84:85], v117 offset:49 +; ALIGNED-NEXT: flat_store_byte v[84:85], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[84:85], v101 offset:43 +; ALIGNED-NEXT: flat_store_byte v[84:85], v103 offset:41 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[84:85], v112 offset:47 +; ALIGNED-NEXT: flat_store_byte v[84:85], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[84:85], v33 offset:45 +; ALIGNED-NEXT: flat_store_byte v[84:85], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[84:85], v34 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[84:85], v113 offset:37 +; ALIGNED-NEXT: flat_store_byte v[84:85], v13 offset:36 +; ALIGNED-NEXT: flat_store_byte v[84:85], v70 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[84:85], v82 offset:33 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[84:85], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[84:85], v71 offset:27 +; ALIGNED-NEXT: flat_store_byte v[84:85], v80 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v81 offset:31 +; ALIGNED-NEXT: flat_store_byte v[84:85], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:29 +; ALIGNED-NEXT: flat_store_byte v[84:85], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[84:85], v29 offset:21 +; ALIGNED-NEXT: flat_store_byte v[84:85], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[84:85], v30 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[84:85], v31 offset:17 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v52 offset:11 +; ALIGNED-NEXT: flat_store_byte v[84:85], v54 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v53 offset:15 +; ALIGNED-NEXT: flat_store_byte v[84:85], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[84:85], v55 offset:13 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[84:85], v64 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[84:85], v67 offset:5 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[84:85], v68 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 
offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v28 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB2_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: s_clause 0xc +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; UNROLL3-LABEL: memcpy_p0_p4_sz2048: @@ -2391,60 +2392,59 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB3_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:250 -; 
ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:222 -; 
ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: s_clause 0x33 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte 
v45, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, 
s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:204 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] @@ -2452,80 +2452,81 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: 
buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: 
buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: s_clause 
0xa -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:186 +; 
ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:158 +; 
ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: s_clause 0xb +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, 
s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill @@ -2917,134 +2918,134 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:235 -; 
ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:207 
-; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:231 +; 
ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:205 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte 
v30, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v82, v0, 
s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 
offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen 
offset:166 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:138 
+; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:129 @@ -3584,13 +3585,13 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 ; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 ; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 ; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 @@ -3599,14 +3600,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 ; 
CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 ; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:128 ; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 ; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 ; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 @@ -3615,30 +3616,30 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 ; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 ; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v35, v2, 
s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v55, 
v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:96 ; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 @@ -3657,20 +3658,18 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(32) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; 
CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:96 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 @@ -3741,7 +3740,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3b ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 @@ -3763,14 +3762,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 ; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38 ; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41 +; 
ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:41 ; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:45 ; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50 @@ -3779,76 +3778,78 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: 
buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: s_waitcnt vmcnt(59) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: s_waitcnt vmcnt(53) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: s_waitcnt vmcnt(50) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, 
v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) +; ALIGNED-NEXT: v_lshl_or_b32 v11, v24, 8, v23 ; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 ; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v13, v22, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -3856,27 +3857,27 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], 
s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) +; ALIGNED-NEXT: s_waitcnt vmcnt(18) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 +; ALIGNED-NEXT: s_waitcnt vmcnt(16) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v48, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v49 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 @@ -3884,15 +3885,15 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 
v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -3900,31 +3901,29 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: 
buffer_store_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v83, 8, v81 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill @@ -3936,16 +3935,16 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill @@ -3954,33 +3953,31 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:712 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v82, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte 
Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4227,7 +4224,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x5 @@ -4241,7 +4238,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v125 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -4254,25 +4251,25 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:158 ; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 8, v120 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v107, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:153 ; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v93, 8, v105 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v92, 8, v106 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v91 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v104, 8, v91 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 @@ -4282,7 +4279,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 ; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:164 ; 
ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:166 ; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 8, v89 @@ -4290,7 +4287,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v79 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -4318,7 +4315,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:176 ; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:177 ; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:178 ; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:179 @@ -4327,7 +4324,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:182 ; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v119 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -4492,84 
+4489,84 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v123, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(28) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v27 ; ALIGNED-NEXT: s_waitcnt vmcnt(26) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v24, 8, v26 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v12, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v12, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v60, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v94, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v22 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v20 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, 
v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v13 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v105, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v101, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v84, v45, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v60, 16, v45 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v5, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v84, v46, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v9, 8, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v60, 16, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v5, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v60, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v60, 16, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v60, 16, v46 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v45, v45, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v60, v60, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v90, 8, v88 -; ALIGNED-NEXT: 
v_lshl_or_b32 v60, v104, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v46, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v60, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v90, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v105, 8, v93 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v46 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v111, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v111, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v60, v110, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v110, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v46 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v92, 8, v104 +; 
ALIGNED-NEXT: v_lshl_or_b32 v46, v93, 8, v105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v60, v94, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v95, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v46 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:228 @@ -4578,7 +4575,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v60 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v45, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v46, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4603,8 +4600,8 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 ; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:244 ; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 @@ -4711,7 +4708,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:181 ; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:176 +; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:176 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 @@ -4737,7 +4734,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:161 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload @@ -4753,16 +4750,16 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:153 ; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:159 ; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:157 ; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:158 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:156 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v105 offset:152 +; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:156 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:152 ; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:146 ; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 ; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:151 @@ -4772,7 +4769,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload @@ -5021,22 +5018,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:340 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload @@ -5051,7 +5048,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload @@ -5236,7 +5233,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], 
v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 @@ -5263,14 +5260,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:11 +; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:11 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:13 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:9 +; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:9 ; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:15 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:14 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:14 ; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:12 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload @@ -5279,7 +5276,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload @@ -5288,7 +5285,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload @@ -5552,14 +5549,15 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-LABEL: memmove_p0_p0_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 
; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 @@ -5568,348 +5566,352 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v28, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v29, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[20:21] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[20:21] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[20:21] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[98:101], v[20:21] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[20:21] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[20:21] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[20:21] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[20:21] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[20:21] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] 
offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[20:21] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[28:29] offset:240 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[28:29] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[28:29] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[28:29] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[28:29] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[28:29] offset:48 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[28:29] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[28:29] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[31:34], v[28:29] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[24:27], v[28:29] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[28:29] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[37:40], v[28:29] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[41:44], v[28:29] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[28:29] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[56:59], v[28:29] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[80:83], v[28:29] offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; 
ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:80 ; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:252 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:254 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:250 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:246 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:242 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:240 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, 
off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:238 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:234 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:230 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v23 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:230 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:226 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:224 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:222 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:218 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:214 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:210 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:208 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], 
s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:206 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:202 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 
offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:198 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:194 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:192 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v51, off, 
s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:190 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:186 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:182 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:178 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:176 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:176 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 
offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:174 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:170 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:166 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:162 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:160 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: 
s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:158 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:154 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:150 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:146 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:144 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:142 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:138 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:164 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:134 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:132 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:144 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:130 +; ALIGNED-NEXT: flat_store_byte v[16:17], 
v20 offset:128 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:128 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:126 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v26 offset:122 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:118 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:114 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:112 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:112 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], 
s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:92 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:110 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:108 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:106 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:102 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:96 ; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 ; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 ; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 ; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:180 +; 
ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:90 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:80 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:78 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v96 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:74 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:64 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:62 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:58 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, 
off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 ; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 @@ -5922,242 +5924,238 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 ; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 +; 
ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 ; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 
offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v80 ; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v65 ; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v64 ; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v30 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:219 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v21 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v20 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, 
v26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v25 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:167 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v117, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 
v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, 
v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 
8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:141 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v26 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 
offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:149 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:147 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:145 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:143 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:141 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:139 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:137 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:135 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:133 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:131 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:129 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:127 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:125 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:123 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:121 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:119 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:117 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:115 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:113 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:111 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:109 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:107 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:105 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:103 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:95 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:91 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:89 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:85 +; ALIGNED-NEXT: 
flat_store_byte v[16:17], v64 offset:83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:79 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:77 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:75 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:73 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:69 ; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:63 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:61 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:59 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:57 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:51 +; ALIGNED-NEXT: 
flat_store_byte v[16:17], v66 offset:49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:43 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:41 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:47 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:45 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:31 ; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:27 ; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:23 ; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:19 ; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 @@ -6206,339 +6204,352 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 ; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 ; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: 
flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[24:25] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[24:25] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[31:34], v[24:25] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[64:67], v[24:25] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[24:25] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[80:83], v[24:25] offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 
offset:336 ; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:254 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:250 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:246 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:242 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:240 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, 
s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:238 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:234 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:230 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:226 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:224 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: 
buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:222 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:218 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:214 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:210 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:208 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 
offset:324 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:206 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:202 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:198 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:194 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:192 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:190 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:186 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:184 +; ALIGNED-NEXT: 
s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:182 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:178 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:176 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:174 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:172 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:170 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:168 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:166 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:164 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, 
s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 
-; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:162 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:160 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 
offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v40, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v41, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:158 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:154 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:150 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:146 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:144 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: 
buffer_store_dword v118, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:142 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:138 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:134 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:132 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v32 offset:112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:130 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:128 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:126 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:122 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:118 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:114 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:112 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:110 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:106 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:102 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:96 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword 
v37, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:94 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:90 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:80 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:78 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:74 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:70 +; ALIGNED-NEXT: flat_store_byte 
v[16:17], v49 offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:64 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[16:17], v64 offset:62 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:58 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:488 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:496 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 ; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 @@ -6551,242 +6562,229 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 ; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:512 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 ; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v86 ; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:249 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v82 ; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:241 ; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:237 ; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:231 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v116, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:229 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v25 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:221 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:223 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:215 +; ALIGNED-NEXT: flat_store_byte 
v[16:17], v85 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v118 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v21 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v119 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v40 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v41 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:175 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v42 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:149 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:147 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:145 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:143 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:141 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:139 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:137 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:135 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:133 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:131 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:129 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:127 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:125 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:123 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:121 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:119 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:117 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:115 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:113 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:111 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:109 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:107 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:105 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:103 +; ALIGNED-NEXT: flat_store_byte 
v[16:17], v32 offset:101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:95 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:93 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:91 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:89 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:85 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:79 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:77 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:75 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:73 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:63 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:61 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:59 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:57 +; ALIGNED-NEXT: flat_store_byte v[16:17], v116 offset:55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v117 offset:43 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:41 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:47 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:45 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:37 +; ALIGNED-NEXT: 
flat_store_byte v[16:17], v87 offset:31 ; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:27 ; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:23 ; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:19 ; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 @@ -6819,15 +6817,16 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x8 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -7055,14 +7054,15 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-LABEL: memmove_p1_p1_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: 
v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 @@ -7071,346 +7071,350 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB6_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v28, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v29, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[20:21], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[22:25], v[20:21], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[20:21], off -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[20:21], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[20:21], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[20:21], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[20:21], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[20:21], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[20:21], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[20:21], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[20:21], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[50:53], v[20:21], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[20:21], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[20:21], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[30:33], v[20:21], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[20:21], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[28:29], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:16 +; ALIGNED-NEXT: 
global_load_dwordx4 v[12:15], v[28:29], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[28:29], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[28:29], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[28:29], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[31:34], v[28:29], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[28:29], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[37:40], v[28:29], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[41:44], v[28:29], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[28:29], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[56:59], v[28:29], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[28:29], off offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:80 ; 
ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:254 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:252 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:254 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:250 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:248 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:250 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:246 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:244 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:246 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:242 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:240 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:242 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 
0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:238 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:236 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:238 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:234 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:232 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:234 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:230 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:228 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:230 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off 
offset:226 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:224 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:226 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:222 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:218 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:214 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:210 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 offset:76 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:222 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:218 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:214 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:210 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:206 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:202 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off 
offset:198 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:194 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:206 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:202 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:198 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:194 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:148 +; 
ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:190 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:186 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:182 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:178 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:190 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:186 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:182 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:178 -; ALIGNED-NEXT: global_store_byte 
v[16:17], v34, off offset:176 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:174 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:170 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:166 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:162 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:158 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:154 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:150 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:146 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:174 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:172 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:142 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off 
offset:170 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:168 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:138 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:166 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:164 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:134 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:132 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:162 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:158 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:154 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:150 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:146 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:144 -; ALIGNED-NEXT: buffer_store_dword 
v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:130 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:142 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:138 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:134 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:130 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:128 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 +; 
ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:126 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:122 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:118 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:114 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:122 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:118 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:116 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:114 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:112 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:110 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:106 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:102 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:98 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v82, off, 
s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:94 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:92 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:110 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:90 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:88 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:106 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:86 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:84 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:102 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:82 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:80 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:98 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:96 ; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 ; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 ; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 ; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword 
v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:94 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:90 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:86 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:82 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword 
v54, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:78 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:76 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:78 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:74 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:72 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:74 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:70 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:68 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:70 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:66 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:66 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: 
buffer_store_dword v67, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:62 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:58 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:54 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:50 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:62 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off 
offset:58 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:54 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:50 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:42 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:46 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:34 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:38 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 @@ -7423,242 +7427,238 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) 
align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 ; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v84, 8, v84 ; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v83 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v82 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v80 ; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v71 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v65 ; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v70 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v64 ; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v51 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v49 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v50 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v36 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v35 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v29 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v30 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v23 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v22 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v21 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v20 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v26 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v25 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v34 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v33 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v32 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:193 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v65, 24, v31 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v48 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v39 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v38 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v37 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v55 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v54 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v53 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v52 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v69 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v68 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v67 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v66 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v14 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v15 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v12 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v13 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v11 +; ALIGNED-NEXT: global_store_byte 
v[16:17], v30, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v9 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, 
v65 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, 
off offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:147 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:145 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:143 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:141 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:139 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:137 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:133 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:131 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:129 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:127 -; ALIGNED-NEXT: 
global_store_byte v[16:17], v69, off offset:125 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:123 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:121 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:117 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:115 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:113 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:111 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:109 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:107 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:105 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:101 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:99 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:97 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:95 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:93 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:91 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:89 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:85 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:83 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:81 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:79 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:77 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:75 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:73 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:71 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:69 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:149 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:147 +; 
ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:145 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:143 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:141 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:139 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:137 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:135 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:133 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:131 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:129 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:127 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:125 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:123 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:121 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:119 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:117 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:115 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:113 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:111 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:109 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:107 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:105 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:103 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:101 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:99 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:97 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:95 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:93 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:91 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:89 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:87 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off 
offset:85 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:83 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:81 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:79 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:77 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:75 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:73 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:71 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:69 ; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:67 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:65 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:63 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:61 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:59 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:57 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:53 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:51 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:49 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:43 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:47 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:35 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:39 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:31 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:65 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:63 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:61 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:59 
+; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:57 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:55 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:53 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:51 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:49 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:43 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:41 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:47 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:45 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:35 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:33 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:39 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:37 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:31 ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:27 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:27 ; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:23 ; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:19 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:19 ; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 @@ -7707,337 +7707,350 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[24:25], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 
v[116:119], v[24:25], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[24:25], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[24:25], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[35:38], v[24:25], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[31:34], v[24:25], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[24:25], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[24:25], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[24:25], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[24:25], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[24:25], off offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; 
ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:336 ; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:254 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:252 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:254 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:250 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:248 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:250 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:246 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:244 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:246 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:242 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:240 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 
offset:340 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:242 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:238 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:236 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:238 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:234 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:232 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:234 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:232 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:230 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:228 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:230 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:226 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:224 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:226 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: 
global_store_byte_d16_hi v[16:17], v86, off offset:222 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:218 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:214 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:210 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:206 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:202 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:198 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:194 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: 
buffer_store_dword v65, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:190 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:186 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:182 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:178 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:222 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:220 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off 
offset:174 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:172 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:218 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:216 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:170 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:168 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:214 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:212 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:166 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:164 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:210 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:206 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:202 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:198 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) 
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:194 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:190 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:186 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:182 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:178 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:162 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:174 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:170 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:166 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:162 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:158 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:154 -; ALIGNED-NEXT: global_store_byte 
v[16:17], v83, off offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:150 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:146 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:142 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:138 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:134 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:130 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:380 +; 
ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v40, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v41, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:158 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:154 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:150 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:146 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword 
v21, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:142 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:122 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:120 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:138 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:118 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:116 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:134 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:132 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:114 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:112 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:130 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:128 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:110 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:108 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:126 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:106 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:104 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:122 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:102 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:100 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:118 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:98 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:114 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:110 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:106 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:102 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:98 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword 
v35, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:94 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:92 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:94 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:90 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:88 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:90 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:86 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:84 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:86 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:82 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:82 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: 
buffer_load_dword v22, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:78 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:74 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:70 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:66 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:78 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:74 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:70 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:66 +; ALIGNED-NEXT: 
global_store_byte v[16:17], v48, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:62 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:60 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:62 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:60 ; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:58 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:56 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:58 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:54 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:52 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:54 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:50 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:50 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi 
v[16:17], v14, off offset:46 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:42 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:46 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:34 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:38 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:496 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 @@ -8050,242 +8063,229 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 ; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v6, off, 
s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v80 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 ; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v86 ; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v83 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:249 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, 
off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v82 ; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v71 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:241 ; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v70 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:237 ; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v67 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v66 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:229 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v25 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:221 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 
24, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:217 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v84, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:223 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:215 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v23 +; ALIGNED-NEXT: global_store_byte v[16:17], v118, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v22 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v21 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v29 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v34 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v33 +; ALIGNED-NEXT: global_store_byte v[16:17], v119, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v32 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v38 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v37 +; ALIGNED-NEXT: global_store_byte v[16:17], v40, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v36 +; 
ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v35 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v51 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v50 +; ALIGNED-NEXT: global_store_byte v[16:17], v41, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v49 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v48 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v64 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v55 +; ALIGNED-NEXT: global_store_byte v[16:17], v42, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v54 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v53 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v14 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v15 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v12 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v11 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v10 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v9 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off 
offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 ; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:147 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:145 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:143 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:141 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:139 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:137 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:133 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:131 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:129 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:127 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:123 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:121 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:117 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:115 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:113 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:111 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:109 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:107 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:105 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:101 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:99 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:97 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:95 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:93 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:91 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:89 -; ALIGNED-NEXT: 
global_store_byte v[16:17], v102, off offset:87 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:85 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:83 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:81 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:79 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:77 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:75 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:73 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:71 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:69 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:67 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:65 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:63 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:61 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:59 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:57 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:49 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:149 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:147 +; ALIGNED-NEXT: 
global_store_byte v[16:17], v18, off offset:145 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:143 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:141 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:139 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:137 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:135 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:133 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:131 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:129 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:127 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:125 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:123 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:121 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:119 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:117 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:115 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:113 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:111 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:109 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:107 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:105 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:103 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:101 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:99 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:97 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:95 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:93 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:91 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:89 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:87 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:85 +; 
ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:83 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:81 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:79 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:77 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:75 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:73 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:71 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:69 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:67 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:65 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:63 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:61 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:59 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:57 +; ALIGNED-NEXT: global_store_byte v[16:17], v116, off offset:55 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:53 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:51 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:49 +; ALIGNED-NEXT: global_store_byte v[16:17], v117, off offset:43 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:41 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:47 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:45 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:35 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:33 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:39 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:37 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:31 ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:27 ; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:23 ; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:19 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:19 ; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 @@ -8318,15 +8318,16 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 ; ALIGNED-NEXT: .LBB6_6: ; %Flow8 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x8 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -8554,6 +8555,17 @@ define void 
@memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-LABEL: memmove_p0_p4_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 @@ -8564,12 +8576,12 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v80, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v81, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 
v[80:83], v[4:5], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[4:5], off offset:208 ; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 ; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 @@ -8586,470 +8598,470 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:254 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:252 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:248 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:246 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:220 +; 
ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v82 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v84 offset:250 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v85 offset:254 +; ALIGNED-NEXT: flat_store_byte v[80:81], v85 offset:252 +; ALIGNED-NEXT: flat_store_byte v[80:81], v84 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v83 offset:246 +; ALIGNED-NEXT: flat_store_byte v[80:81], v83 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v82 offset:242 +; ALIGNED-NEXT: flat_store_byte v[80:81], v82 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v99 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[80:81], v86 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v97 +; ALIGNED-NEXT: flat_store_byte v[80:81], v87 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[80:81], v112 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v96 +; ALIGNED-NEXT: flat_store_byte v[80:81], v113 offset:253 +; ALIGNED-NEXT: flat_store_byte v[80:81], v114 offset:247 +; ALIGNED-NEXT: flat_store_byte v[80:81], v115 offset:245 +; ALIGNED-NEXT: flat_store_byte v[80:81], v116 offset:243 +; ALIGNED-NEXT: flat_store_byte v[80:81], v82 offset:241 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v98 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v99 offset:238 +; ALIGNED-NEXT: flat_store_byte v[80:81], v99 offset:236 +; ALIGNED-NEXT: flat_store_byte v[80:81], v98 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v97 offset:230 +; ALIGNED-NEXT: flat_store_byte v[80:81], v97 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v96 offset:226 +; ALIGNED-NEXT: flat_store_byte v[80:81], v96 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 
offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v100 +; ALIGNED-NEXT: flat_store_byte v[80:81], v83 offset:235 +; ALIGNED-NEXT: flat_store_byte v[80:81], v84 offset:233 +; ALIGNED-NEXT: flat_store_byte v[80:81], v85 offset:239 +; ALIGNED-NEXT: flat_store_byte v[80:81], v117 offset:237 +; ALIGNED-NEXT: flat_store_byte v[80:81], v86 offset:231 +; ALIGNED-NEXT: flat_store_byte v[80:81], v87 offset:229 +; ALIGNED-NEXT: flat_store_byte v[80:81], v112 offset:227 +; ALIGNED-NEXT: flat_store_byte v[80:81], v96 offset:225 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:176 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v102 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v103 offset:222 +; ALIGNED-NEXT: flat_store_byte v[80:81], v103 offset:220 +; ALIGNED-NEXT: flat_store_byte v[80:81], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v101 offset:214 +; ALIGNED-NEXT: flat_store_byte v[80:81], v101 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v100 offset:210 +; ALIGNED-NEXT: flat_store_byte v[80:81], v100 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v69 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[80:81], v113 offset:219 +; ALIGNED-NEXT: flat_store_byte v[80:81], v114 offset:217 +; ALIGNED-NEXT: flat_store_byte v[80:81], v115 offset:223 +; ALIGNED-NEXT: flat_store_byte v[80:81], v116 offset:221 +; ALIGNED-NEXT: flat_store_byte v[80:81], v82 offset:215 +; ALIGNED-NEXT: flat_store_byte v[80:81], v97 offset:213 +; ALIGNED-NEXT: flat_store_byte v[80:81], v98 offset:211 +; ALIGNED-NEXT: flat_store_byte v[80:81], v100 offset:209 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[80:81], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[80:81], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[80:81], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[80:81], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v67 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[80:81], v99 offset:203 +; ALIGNED-NEXT: flat_store_byte v[80:81], v83 offset:201 +; ALIGNED-NEXT: flat_store_byte v[80:81], v84 offset:207 +; ALIGNED-NEXT: flat_store_byte v[80:81], v85 offset:205 +; ALIGNED-NEXT: flat_store_byte v[80:81], v117 offset:199 +; ALIGNED-NEXT: flat_store_byte v[80:81], v86 offset:197 +; ALIGNED-NEXT: flat_store_byte v[80:81], v87 offset:195 +; ALIGNED-NEXT: flat_store_byte v[80:81], v68 offset:193 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[80:81], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[80:81], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[80:81], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v64 offset:178 +; 
ALIGNED-NEXT: flat_store_byte v[80:81], v64 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[80:81], v112 offset:187 +; ALIGNED-NEXT: flat_store_byte v[80:81], v96 offset:185 +; ALIGNED-NEXT: 
flat_store_byte v[80:81], v101 offset:191 +; ALIGNED-NEXT: flat_store_byte v[80:81], v102 offset:189 +; ALIGNED-NEXT: flat_store_byte v[80:81], v103 offset:183 +; ALIGNED-NEXT: flat_store_byte v[80:81], v113 offset:181 +; ALIGNED-NEXT: flat_store_byte v[80:81], v114 offset:179 +; ALIGNED-NEXT: flat_store_byte v[80:81], v64 offset:177 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte v[80:81], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[80:81], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[80:81], v52 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[80:81], v53 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: 
buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[80:81], v115 offset:171 +; ALIGNED-NEXT: flat_store_byte v[80:81], v116 offset:169 +; ALIGNED-NEXT: flat_store_byte v[80:81], v82 offset:175 +; ALIGNED-NEXT: flat_store_byte v[80:81], v97 offset:173 +; ALIGNED-NEXT: flat_store_byte v[80:81], v98 offset:163 +; ALIGNED-NEXT: flat_store_byte v[80:81], v69 offset:161 +; ALIGNED-NEXT: flat_store_byte v[80:81], v70 offset:167 +; ALIGNED-NEXT: flat_store_byte v[80:81], v53 offset:165 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[80:81], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[80:81], v50 offset:152 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[80:81], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[80:81], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[80:81], v48 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[80:81], v71 offset:155 +; ALIGNED-NEXT: flat_store_byte v[80:81], v100 offset:153 +; ALIGNED-NEXT: flat_store_byte v[80:81], v99 offset:159 +; ALIGNED-NEXT: flat_store_byte v[80:81], v83 offset:157 +; ALIGNED-NEXT: flat_store_byte v[80:81], v84 offset:151 +; ALIGNED-NEXT: flat_store_byte v[80:81], v85 offset:149 +; ALIGNED-NEXT: flat_store_byte v[80:81], v117 offset:147 +; ALIGNED-NEXT: flat_store_byte v[80:81], v48 offset:145 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: 
buffer_store_dword v39, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[80:81], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[80:81], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[80:81], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[80:81], v36 offset:128 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[80:81], v86 offset:139 +; ALIGNED-NEXT: flat_store_byte v[80:81], v87 offset:137 +; ALIGNED-NEXT: flat_store_byte v[80:81], v65 offset:143 +; ALIGNED-NEXT: flat_store_byte v[80:81], v66 offset:141 +; ALIGNED-NEXT: flat_store_byte v[80:81], v67 offset:135 +; ALIGNED-NEXT: flat_store_byte v[80:81], v68 offset:133 +; ALIGNED-NEXT: flat_store_byte v[80:81], v112 offset:131 +; ALIGNED-NEXT: flat_store_byte v[80:81], v36 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[80:81], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[80:81], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[80:81], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[80:81], v32 offset:112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 
offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v28 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 
-; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[80:81], v96 offset:123 +; ALIGNED-NEXT: flat_store_byte v[80:81], v101 offset:121 +; ALIGNED-NEXT: flat_store_byte v[80:81], v102 offset:127 +; ALIGNED-NEXT: flat_store_byte v[80:81], v103 offset:125 +; ALIGNED-NEXT: flat_store_byte v[80:81], v113 offset:119 +; ALIGNED-NEXT: flat_store_byte v[80:81], v114 offset:117 +; ALIGNED-NEXT: flat_store_byte v[80:81], v54 offset:115 +; ALIGNED-NEXT: flat_store_byte v[80:81], v32 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[80:81], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[80:81], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[80:81], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[80:81], v28 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:125 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v14 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[80:81], v55 offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 
24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[80:81], v64 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v27 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:105 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 
-; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v27 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[80:81], v118 offset:111 +; ALIGNED-NEXT: flat_store_byte v[80:81], v115 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[80:81], v116 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[80:81], v82 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[80:81], v97 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[80:81], v28 offset:97 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v4 +; 
ALIGNED-NEXT: flat_store_byte v[80:81], v98 offset:91 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[80:81], v52 offset:89 +; ALIGNED-NEXT: flat_store_byte v[80:81], v69 offset:95 +; ALIGNED-NEXT: flat_store_byte v[80:81], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[80:81], v49 offset:93 +; ALIGNED-NEXT: flat_store_byte v[80:81], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte v[80:81], v50 offset:87 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[80:81], v51 offset:85 +; ALIGNED-NEXT: flat_store_byte v[80:81], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte v[80:81], v53 offset:83 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[80:81], v70 offset:81 +; ALIGNED-NEXT: flat_store_byte v[80:81], v24 offset:80 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte v[80:81], v71 offset:75 +; ALIGNED-NEXT: flat_store_byte v[80:81], v99 offset:73 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[80:81], v83 offset:79 +; ALIGNED-NEXT: flat_store_byte v[80:81], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[80:81], v84 offset:77 +; ALIGNED-NEXT: flat_store_byte v[80:81], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte v[80:81], v85 offset:71 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[80:81], v100 offset:69 +; ALIGNED-NEXT: flat_store_byte v[80:81], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte v[80:81], v37 offset:67 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[80:81], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[80:81], v38 offset:65 +; ALIGNED-NEXT: flat_store_byte v[80:81], v20 offset:64 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[80:81], v39 offset:59 +; ALIGNED-NEXT: flat_store_byte v[80:81], v48 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[80:81], v86 offset:63 +; ALIGNED-NEXT: flat_store_byte v[80:81], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[80:81], v87 offset:61 +; ALIGNED-NEXT: flat_store_byte v[80:81], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[80:81], v65 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[80:81], v66 offset:53 +; ALIGNED-NEXT: flat_store_byte v[80:81], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[80:81], v67 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[80:81], v68 offset:49 +; ALIGNED-NEXT: flat_store_byte v[80:81], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[80:81], v112 offset:43 +; ALIGNED-NEXT: flat_store_byte v[80:81], v33 offset:41 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[80:81], v34 offset:47 +; ALIGNED-NEXT: flat_store_byte v[80:81], v15 
offset:44 +; ALIGNED-NEXT: flat_store_byte v[80:81], v35 offset:45 +; ALIGNED-NEXT: flat_store_byte v[80:81], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[80:81], v36 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[80:81], v96 offset:37 +; ALIGNED-NEXT: flat_store_byte v[80:81], v13 offset:36 +; ALIGNED-NEXT: flat_store_byte v[80:81], v101 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[80:81], v102 offset:33 +; ALIGNED-NEXT: flat_store_byte v[80:81], v12 offset:32 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[80:81], v103 offset:27 +; ALIGNED-NEXT: flat_store_byte v[80:81], v113 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[80:81], v114 offset:31 +; ALIGNED-NEXT: flat_store_byte v[80:81], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[80:81], v54 offset:29 +; ALIGNED-NEXT: flat_store_byte v[80:81], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[80:81], v29 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[80:81], v30 offset:21 +; ALIGNED-NEXT: flat_store_byte v[80:81], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[80:81], v31 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[80:81], v32 offset:17 +; ALIGNED-NEXT: flat_store_byte v[80:81], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], 
s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[80:81], v55 offset:11 +; ALIGNED-NEXT: flat_store_byte v[80:81], v64 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[80:81], v117 offset:15 +; ALIGNED-NEXT: flat_store_byte v[80:81], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[80:81], v115 offset:13 +; ALIGNED-NEXT: flat_store_byte v[80:81], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[80:81], v116 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[80:81], v82 offset:5 +; ALIGNED-NEXT: flat_store_byte v[80:81], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[80:81], v97 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[80:81], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[80:81], v28 offset:1 +; ALIGNED-NEXT: flat_store_byte v[80:81], v4 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2 ; ALIGNED-NEXT: .LBB7_3: ; %Flow6 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -9062,11 +9074,11 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 ; 
ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 @@ -9084,473 +9096,485 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v99 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v98 offset:250 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v99 offset:254 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:252 +; ALIGNED-NEXT: flat_store_byte v[84:85], v98 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v97 offset:246 +; ALIGNED-NEXT: flat_store_byte v[84:85], v97 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v96 offset:242 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v103 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v101 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v101 +; ALIGNED-NEXT: flat_store_byte v[84:85], v112 offset:255 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v100 +; ALIGNED-NEXT: flat_store_byte v[84:85], v113 offset:253 +; ALIGNED-NEXT: flat_store_byte v[84:85], v114 offset:247 +; ALIGNED-NEXT: flat_store_byte v[84:85], v115 offset:245 +; ALIGNED-NEXT: flat_store_byte v[84:85], v116 offset:243 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:241 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v102 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v103 offset:238 +; ALIGNED-NEXT: flat_store_byte v[84:85], v103 offset:236 +; ALIGNED-NEXT: flat_store_byte v[84:85], v102 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v101 offset:230 +; ALIGNED-NEXT: flat_store_byte v[84:85], v101 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v100 offset:226 +; ALIGNED-NEXT: flat_store_byte v[84:85], v100 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 
offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[84:85], v97 offset:235 +; ALIGNED-NEXT: flat_store_byte v[84:85], v98 offset:233 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:239 +; ALIGNED-NEXT: flat_store_byte v[84:85], v117 offset:237 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:231 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:229 +; ALIGNED-NEXT: flat_store_byte v[84:85], v112 offset:227 +; ALIGNED-NEXT: flat_store_byte v[84:85], v100 offset:225 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[84:85], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[84:85], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v81 offset:214 +; 
ALIGNED-NEXT: flat_store_byte v[84:85], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[84:85], v80 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v43, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v44, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v69 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[84:85], v113 offset:219 +; ALIGNED-NEXT: flat_store_byte v[84:85], v114 offset:217 +; ALIGNED-NEXT: flat_store_byte v[84:85], v115 offset:223 +; ALIGNED-NEXT: flat_store_byte v[84:85], v116 offset:221 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:215 +; ALIGNED-NEXT: flat_store_byte v[84:85], v101 offset:213 +; ALIGNED-NEXT: flat_store_byte v[84:85], v102 offset:211 +; ALIGNED-NEXT: flat_store_byte v[84:85], v80 offset:209 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[84:85], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[84:85], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[84:85], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[84:85], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v45, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v46, 8, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v47, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v56, 8, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v57, 24, v64 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v55 +; ALIGNED-NEXT: flat_store_byte 
v[84:85], v103 offset:203 +; ALIGNED-NEXT: flat_store_byte v[84:85], v42 offset:201 +; ALIGNED-NEXT: flat_store_byte v[84:85], v43 offset:207 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:205 +; ALIGNED-NEXT: flat_store_byte v[84:85], v44 offset:199 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:197 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:195 +; ALIGNED-NEXT: flat_store_byte v[84:85], v68 offset:193 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[84:85], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[84:85], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[84:85], v64 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v40, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v41, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v58, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[84:85], v45 offset:187 +; ALIGNED-NEXT: flat_store_byte v[84:85], v81 offset:185 +; ALIGNED-NEXT: flat_store_byte v[84:85], v82 offset:191 +; ALIGNED-NEXT: flat_store_byte v[84:85], v46 offset:189 +; ALIGNED-NEXT: flat_store_byte v[84:85], v47 offset:183 +; ALIGNED-NEXT: flat_store_byte v[84:85], v56 offset:181 +; ALIGNED-NEXT: flat_store_byte v[84:85], v57 offset:179 +; ALIGNED-NEXT: flat_store_byte v[84:85], v64 offset:177 +; 
ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte v[84:85], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[84:85], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[84:85], v52 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[84:85], v53 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[84:85], v118 offset:175 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v51 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v48 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[84:85], v97 offset:171 +; ALIGNED-NEXT: flat_store_byte v[84:85], v40 offset:169 +; ALIGNED-NEXT: 
flat_store_byte v[84:85], v119 offset:173 +; ALIGNED-NEXT: flat_store_byte v[84:85], v41 offset:163 +; ALIGNED-NEXT: flat_store_byte v[84:85], v58 offset:161 +; ALIGNED-NEXT: flat_store_byte v[84:85], v69 offset:167 +; ALIGNED-NEXT: flat_store_byte v[84:85], v42 offset:165 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[84:85], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[84:85], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[84:85], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[84:85], v48 offset:144 +; ALIGNED-NEXT: flat_store_byte v[84:85], v118 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 8, v36 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:155 +; ALIGNED-NEXT: flat_store_byte v[84:85], v117 offset:153 +; ALIGNED-NEXT: flat_store_byte v[84:85], v98 offset:159 +; ALIGNED-NEXT: flat_store_byte v[84:85], v100 offset:157 +; ALIGNED-NEXT: flat_store_byte v[84:85], v102 offset:151 +; ALIGNED-NEXT: flat_store_byte v[84:85], v115 offset:149 +; ALIGNED-NEXT: flat_store_byte v[84:85], v116 offset:147 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[84:85], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[84:85], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[84:85], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[84:85], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[84:85], v83 offset:139 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:137 +; ALIGNED-NEXT: flat_store_byte v[84:85], v114 offset:143 +; ALIGNED-NEXT: flat_store_byte v[84:85], v101 offset:141 +; ALIGNED-NEXT: flat_store_byte v[84:85], v103 offset:135 +; ALIGNED-NEXT: flat_store_byte v[84:85], v112 offset:133 +; ALIGNED-NEXT: flat_store_byte v[84:85], v113 offset:131 +; ALIGNED-NEXT: flat_store_byte v[84:85], v118 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[84:85], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[84:85], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[84:85], v32 offset:112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 -; ALIGNED-NEXT: 
buffer_store_dword v50, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v28 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], 
v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[84:85], v70 offset:123 +; ALIGNED-NEXT: flat_store_byte v[84:85], v82 offset:121 +; ALIGNED-NEXT: flat_store_byte v[84:85], v71 offset:127 +; ALIGNED-NEXT: flat_store_byte v[84:85], v80 offset:125 +; ALIGNED-NEXT: flat_store_byte v[84:85], v81 offset:119 +; 
ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:117 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:115 +; ALIGNED-NEXT: flat_store_byte v[84:85], v32 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[84:85], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[84:85], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[84:85], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[84:85], v28 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v14 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v52 offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v54 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v6 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 
offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword 
v7, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[84:85], v53 offset:111 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, 
v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v20 +; ALIGNED-NEXT: flat_store_byte v[84:85], v55 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[84:85], v64 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[84:85], v66 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[84:85], v68 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[84:85], v28 offset:97 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:91 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[84:85], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[84:85], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[84:85], v25 offset:84 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[84:85], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[84:85], v24 offset:80 +; ALIGNED-NEXT: flat_store_byte v[84:85], v67 offset:89 +; ALIGNED-NEXT: flat_store_byte v[84:85], v69 offset:95 +; ALIGNED-NEXT: flat_store_byte v[84:85], v49 offset:93 +; ALIGNED-NEXT: flat_store_byte v[84:85], v50 offset:87 +; ALIGNED-NEXT: flat_store_byte v[84:85], v51 offset:85 +; ALIGNED-NEXT: flat_store_byte v[84:85], v97 offset:83 +; ALIGNED-NEXT: flat_store_byte v[84:85], v35 offset:81 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[84:85], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[84:85], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[84:85], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[84:85], v48 offset:75 +; ALIGNED-NEXT: flat_store_byte v[84:85], v96 offset:73 +; ALIGNED-NEXT: flat_store_byte v[84:85], v98 offset:79 +; ALIGNED-NEXT: flat_store_byte v[84:85], v100 offset:77 +; ALIGNED-NEXT: flat_store_byte v[84:85], v102 offset:71 +; ALIGNED-NEXT: flat_store_byte v[84:85], v115 offset:69 +; ALIGNED-NEXT: flat_store_byte v[84:85], v116 offset:67 +; ALIGNED-NEXT: flat_store_byte v[84:85], v32 offset:65 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:400 
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[84:85], v37 offset:59 +; ALIGNED-NEXT: flat_store_byte v[84:85], v38 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[84:85], v39 offset:63 +; ALIGNED-NEXT: flat_store_byte v[84:85], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:61 +; ALIGNED-NEXT: flat_store_byte v[84:85], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[84:85], v83 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:53 +; ALIGNED-NEXT: flat_store_byte v[84:85], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[84:85], v114 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[84:85], v117 offset:49 +; ALIGNED-NEXT: flat_store_byte v[84:85], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[84:85], v101 offset:43 +; ALIGNED-NEXT: flat_store_byte v[84:85], v103 offset:41 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[84:85], v112 offset:47 +; ALIGNED-NEXT: flat_store_byte v[84:85], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[84:85], v33 offset:45 +; ALIGNED-NEXT: flat_store_byte v[84:85], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[84:85], v34 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[84:85], v113 offset:37 +; ALIGNED-NEXT: flat_store_byte v[84:85], v13 offset:36 +; ALIGNED-NEXT: flat_store_byte v[84:85], v70 offset:35 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[84:85], v82 offset:33 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[84:85], v71 offset:27 +; ALIGNED-NEXT: flat_store_byte v[84:85], v80 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v81 offset:31 +; ALIGNED-NEXT: flat_store_byte v[84:85], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[84:85], v86 offset:29 +; ALIGNED-NEXT: flat_store_byte v[84:85], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[84:85], v87 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[84:85], v29 offset:21 +; ALIGNED-NEXT: flat_store_byte v[84:85], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[84:85], v30 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[84:85], v31 offset:17 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v52 offset:11 +; ALIGNED-NEXT: flat_store_byte v[84:85], v54 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v53 offset:15 +; ALIGNED-NEXT: flat_store_byte v[84:85], v7 
offset:12 +; ALIGNED-NEXT: flat_store_byte v[84:85], v55 offset:13 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[84:85], v64 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[84:85], v66 offset:5 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[84:85], v68 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v28 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5 ; ALIGNED-NEXT: .LBB7_6: ; %Flow7 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: s_clause 0xa +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; UNROLL3-LABEL: memmove_p0_p4_sz2048: @@ -11341,10 +11365,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:201 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:200 ; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 ; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 ; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 @@ -11407,8 +11429,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: s_clause 0xb ; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: s_clause 0xa ; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 ; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 @@ -11421,8 +11443,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x34 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x33 ; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 @@ -11474,7 +11496,9 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:77 ; 
ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill @@ -11863,9 +11887,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:200 ; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 ; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 ; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 @@ -11939,7 +11961,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: 
buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 @@ -11993,7 +12015,9 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73 @@ -12563,13 +12587,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 ; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 ; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 ; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 @@ -12578,46 +12602,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: 
buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 ; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 ; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: 
buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164 +; 
CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:96 ; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 @@ -12631,24 +12655,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 
%dst, ptr addrspace(5 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 ; CHECK-NEXT: s_addc_u32 s5, s5, -1 -; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:96 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 @@ -12726,7 +12748,7 @@ define void @memmove_p0_p5_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_2 ; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3b ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 @@ -12764,13 +12786,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 @@ -12780,59 +12802,61 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: 
buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: s_waitcnt vmcnt(59) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: s_waitcnt vmcnt(53) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: s_waitcnt vmcnt(50) ; ALIGNED-NEXT: buffer_store_dword 
v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, 
v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 @@ -12841,27 +12865,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 +; ALIGNED-NEXT: s_waitcnt vmcnt(18) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v49, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 +; ALIGNED-NEXT: s_waitcnt vmcnt(16) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v39, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 ; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 @@ -12871,13 +12895,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -12886,26 +12910,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v82, 8, v81 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill @@ -12919,10 +12941,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill @@ -12938,34 +12960,32 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, 
off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v83, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:1016 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14004,7 +14024,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload @@ -14016,7 +14036,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload @@ -14073,7 +14093,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:912 ; 4-byte Folded Reload @@ -14085,7 +14105,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload @@ -14094,7 +14114,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload @@ -14296,7 +14316,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: s_clause 0x3b ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 @@ -14318,14 +14338,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 ; 
ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38 ; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:41 ; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:45 ; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50 @@ -14334,77 +14354,78 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: 
buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 ; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: s_waitcnt vmcnt(59) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: s_waitcnt vmcnt(53) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: s_waitcnt vmcnt(50) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v13, 
off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(36) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(34) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(32) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) +; ALIGNED-NEXT: v_lshl_or_b32 v11, v24, 8, v23 +; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(33) +; ALIGNED-NEXT: v_lshl_or_b32 v13, v22, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(30) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7 @@ -14412,27 +14433,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(17) +; ALIGNED-NEXT: s_waitcnt vmcnt(18) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 +; ALIGNED-NEXT: s_waitcnt vmcnt(16) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v48, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v49 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -14440,15 +14461,15 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -14456,31 +14477,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v23, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v83, 8, v81 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14492,16 +14511,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 
%dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill @@ -14510,33 +14529,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 
offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v82, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14741,10 +14758,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 @@ -14756,8 +14771,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -14770,65 +14783,70 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v127 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v6 
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:159 ; ALIGNED-NEXT: buffer_load_ubyte v106, v4, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v120, 8, v121 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v107, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v94, 8, v105 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v93, 8, v105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v91 ; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 @@ -15046,18 +15064,30 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v123, v3, 16, v2 +; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(28) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v25, 8, v27 -; ALIGNED-NEXT: s_waitcnt vmcnt(21) +; ALIGNED-NEXT: s_waitcnt vmcnt(26) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v24, 8, v26 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v43, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v57, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v104, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v21, 8, v22 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 ; ALIGNED-NEXT: 
v_lshl_or_b32 v2, v17, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 @@ -15066,60 +15096,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v84, v43, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v43, v9, 8, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v57, 16, v43 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v43, v5, 8, v6 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v57, v7, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v57, 16, v43 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v43, v43, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v57, v57, 8, v127 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1364 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v57, v57, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: v_lshl_or_b32 v43, v90, 8, v78 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v124, 8, v91 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v124, 8, v92 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v107, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v43, v109, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v108, 8, v110 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v108, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen 
offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v91, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v43, v92, 8, v104 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v93, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v94, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 @@ -15159,7 +15175,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 ; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload @@ -15308,18 +15324,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:154 ; ALIGNED-NEXT: flat_store_byte v[2:3], v106 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:159 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:157 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:158 -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v122 offset:156 +; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:159 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:157 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:158 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:156 ; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:147 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload @@ -15329,10 +15343,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload @@ -15581,22 +15595,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr 
addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:74 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:75 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload @@ -15611,7 +15625,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; 
ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:76 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload @@ -15823,28 +15837,30 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; ALIGNED-NEXT: flat_store_byte v[2:3], v90 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:9 +; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:13 +; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:9 ; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:15 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:14 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:12 -; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:8 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:14 +; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:12 +; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 ; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index f7aaa3ec4d0ed..518d9b00f1a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-LABEL: test_mfma_f32_32x32x1f32_vgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: v_mov_b32_e32 v29, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -21,13 +21,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v2, s18 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, s22 +; GFX908-NEXT: v_mov_b32_e32 v1, s23 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, s21 -; 
GFX908-NEXT: v_mov_b32_e32 v1, s22 -; GFX908-NEXT: v_mov_b32_e32 v2, s23 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v1 -; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v1 ; GFX908-NEXT: v_mov_b32_e32 v0, s24 ; GFX908-NEXT: v_mov_b32_e32 v1, s25 ; GFX908-NEXT: v_mov_b32_e32 v2, s26 @@ -68,7 +66,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v0, s10 ; GFX908-NEXT: v_mov_b32_e32 v1, s11 ; GFX908-NEXT: v_mov_b32_e32 v2, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 +; GFX908-NEXT: v_mov_b32_e32 v5, s21 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v1 @@ -77,7 +76,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v1, s14 ; GFX908-NEXT: v_mov_b32_e32 v2, s15 ; GFX908-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 @@ -91,57 +91,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; 
GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a19 +; GFX908-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:96 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a28 +; GFX908-NEXT: 
v_accvgpr_read_b32 v16, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a4 +; GFX908-NEXT: global_store_dwordx4 v29, v[1:4], s[34:35] offset:112 +; GFX908-NEXT: global_store_dwordx4 v29, v[5:8], s[34:35] offset:64 +; GFX908-NEXT: global_store_dwordx4 v29, v[9:12], s[34:35] offset:80 +; GFX908-NEXT: global_store_dwordx4 v29, v[13:16], s[34:35] offset:32 +; GFX908-NEXT: global_store_dwordx4 v29, v[17:20], s[34:35] offset:48 +; GFX908-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] +; GFX908-NEXT: global_store_dwordx4 v29, v[25:28], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -154,7 +139,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-LABEL: test_mfma_f32_32x32x1f32_agpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: v_mov_b32_e32 v29, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -164,13 +149,11 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v2, s18 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, s22 +; GFX908-NEXT: v_mov_b32_e32 v1, s23 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, s21 -; GFX908-NEXT: v_mov_b32_e32 v1, s22 -; GFX908-NEXT: v_mov_b32_e32 v2, s23 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v1 -; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v1 ; GFX908-NEXT: v_mov_b32_e32 v0, s24 ; GFX908-NEXT: v_mov_b32_e32 v1, s25 ; GFX908-NEXT: v_mov_b32_e32 v2, s26 @@ -211,7 +194,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v0, s10 ; GFX908-NEXT: v_mov_b32_e32 v1, s11 ; GFX908-NEXT: v_mov_b32_e32 v2, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 +; GFX908-NEXT: v_mov_b32_e32 v5, s21 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v1 @@ -220,7 +204,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v1, s14 ; GFX908-NEXT: v_mov_b32_e32 v2, s15 ; GFX908-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 @@ -234,57 +219,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; GFX908-NEXT: s_nop 0 
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; GFX908-NEXT: 
v_accvgpr_read_b32 v4, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a19 +; GFX908-NEXT: global_store_dwordx4 v29, v[0:3], s[34:35] offset:96 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a4 +; GFX908-NEXT: global_store_dwordx4 v29, v[1:4], s[34:35] offset:112 +; GFX908-NEXT: global_store_dwordx4 v29, v[5:8], s[34:35] offset:64 +; GFX908-NEXT: global_store_dwordx4 v29, v[9:12], s[34:35] offset:80 +; GFX908-NEXT: global_store_dwordx4 v29, v[13:16], s[34:35] offset:32 +; GFX908-NEXT: global_store_dwordx4 v29, v[17:20], s[34:35] offset:48 +; GFX908-NEXT: global_store_dwordx4 v29, v[21:24], s[34:35] +; GFX908-NEXT: global_store_dwordx4 v29, v[25:28], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -351,36 +321,35 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: 
v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 @@ -458,36 +427,35 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; 
GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 @@ -565,36 +533,35 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 
v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 @@ -694,36 +661,35 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 -; GFX908-NEXT: 
v_accvgpr_read_b32 v7, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 @@ -760,8 +726,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX908-NEXT: s_load_dword s8, s[4:5], 0x2c ; GFX908-NEXT: v_mov_b32_e32 v6, 1.0 -; GFX908-NEXT: v_mov_b32_e32 v7, 0 ; GFX908-NEXT: s_addc_u32 s53, s53, 0 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 ; 
GFX908-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40 @@ -842,57 +808,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a24 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a31 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a28 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a16 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a20 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a8 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a12 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; 
GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a4 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a7 +; GFX908-NEXT: global_store_dwordx4 v32, v[3:6], s[6:7] offset:96 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:48 +; 
GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 ; GFX908-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX908-NEXT: ; %bb.1: ; %bb2 ; GFX908-NEXT: s_add_u32 s8, s4, 48 @@ -981,36 +932,35 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 -; GFX908-NEXT: s_nop 0 -; 
GFX908-NEXT: v_accvgpr_read_b32 v5, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 @@ -1083,36 +1033,35 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 ; GFX908-NEXT: 
v_accvgpr_read_b32 v4, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index e6d7b14381d7a..a7b50ad9121df 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -89,74 +89,73 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, 2.0 -; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 -; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 -; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 -; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33 -; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59 -; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58 -; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34 -; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57 -; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56 -; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35 -; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55 -; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54 -; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36 -; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53 -; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52 -; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37 -; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51 -; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50 -; 
GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49 -; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48 -; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39 -; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46 -; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40 -; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19 -; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41 -; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17 -; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16 -; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15 -; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43 -; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13 -; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44 -; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11 -; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10 -; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7 -; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5 -; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a61 +; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a60 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a59 +; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a58 +; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a57 +; 
GREEDY908-NEXT: v_accvgpr_read_b32 v9, a56 +; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a55 +; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a54 +; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a53 +; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a52 +; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a51 +; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a50 +; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a49 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a48 +; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a47 +; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a46 +; GREEDY908-NEXT: v_accvgpr_read_b32 v20, a45 +; GREEDY908-NEXT: v_accvgpr_read_b32 v21, a44 +; GREEDY908-NEXT: v_accvgpr_read_b32 v22, a43 +; GREEDY908-NEXT: v_accvgpr_read_b32 v23, a42 +; GREEDY908-NEXT: v_accvgpr_read_b32 v24, a41 +; GREEDY908-NEXT: v_accvgpr_read_b32 v25, a40 +; GREEDY908-NEXT: v_accvgpr_read_b32 v26, a39 +; GREEDY908-NEXT: v_accvgpr_read_b32 v27, a38 +; GREEDY908-NEXT: v_accvgpr_read_b32 v28, a37 +; GREEDY908-NEXT: v_accvgpr_read_b32 v29, a36 +; GREEDY908-NEXT: v_accvgpr_read_b32 v30, a35 +; GREEDY908-NEXT: v_accvgpr_read_b32 v31, a34 +; GREEDY908-NEXT: v_accvgpr_read_b32 v32, a33 +; GREEDY908-NEXT: v_accvgpr_read_b32 v33, a32 +; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v31 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v32 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v33 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v30 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v29 +; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v28 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v27 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v26 +; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v25 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v24 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v23 +; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v22 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v21 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v20 +; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v19 +; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v18 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v17 +; GREEDY908-NEXT: 
v_accvgpr_write_b32 a19, v16 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v15 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v14 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v13 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v11 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v10 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v9 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v8 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v7 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v4 +; GREEDY908-NEXT: v_mov_b32_e32 v27, 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 7 @@ -165,57 +164,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], 
s[34:35] offset:80 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 -; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a31 +; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a19 +; GREEDY908-NEXT: global_store_dwordx4 v27, v[0:3], s[34:35] offset:96 +; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a23 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a30 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a29 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a28 +; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a11 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a15 +; GREEDY908-NEXT: global_store_dwordx4 v27, v[1:4], s[34:35] offset:112 +; GREEDY908-NEXT: v_accvgpr_read_b32 v21, a3 +; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a18 +; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a16 +; GREEDY908-NEXT: v_accvgpr_read_b32 v25, a7 +; GREEDY908-NEXT: 
v_accvgpr_read_b32 v8, a22 +; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a21 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a20 +; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a10 +; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a9 +; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a8 +; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a14 +; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a13 +; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a12 +; GREEDY908-NEXT: v_accvgpr_read_b32 v20, a2 +; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a0 +; GREEDY908-NEXT: v_accvgpr_read_b32 v24, a6 +; GREEDY908-NEXT: v_accvgpr_read_b32 v23, a5 +; GREEDY908-NEXT: v_accvgpr_read_b32 v22, a4 +; GREEDY908-NEXT: global_store_dwordx4 v27, v[2:5], s[34:35] offset:64 +; GREEDY908-NEXT: global_store_dwordx4 v27, v[6:9], s[34:35] offset:80 +; GREEDY908-NEXT: global_store_dwordx4 v27, v[10:13], s[34:35] offset:32 +; GREEDY908-NEXT: global_store_dwordx4 v27, v[14:17], s[34:35] offset:48 +; GREEDY908-NEXT: global_store_dwordx4 v27, v[18:21], s[34:35] +; GREEDY908-NEXT: global_store_dwordx4 v27, v[22:25], s[34:35] offset:16 ; GREEDY908-NEXT: s_endpgm ; ; GREEDY90A-LABEL: test_mfma_f32_32x32x1f32: @@ -640,49 +624,72 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY908-NEXT: v_mov_b32_e32 v5, s15 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s14 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 -; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 -; 
GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 +; GREEDY908-NEXT: v_mov_b32_e32 v19, s0 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 -; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v19 +; GREEDY908-NEXT: v_mov_b32_e32 v19, s3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v19 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 +; GREEDY908-NEXT: v_mov_b32_e32 v19, s6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v19 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 +; GREEDY908-NEXT: v_mov_b32_e32 v19, s9 +; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v19 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 +; GREEDY908-NEXT: v_mov_b32_e32 v19, s12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v19 +; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 +; GREEDY908-NEXT: v_mov_b32_e32 v2, 
s14 +; GREEDY908-NEXT: v_mov_b32_e32 v19, s15 +; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v19 ; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] ; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 -; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 +; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a29 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a28 +; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a27 +; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a26 +; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a25 +; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a24 +; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a23 +; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a22 +; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a21 +; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a20 +; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a19 +; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a18 +; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a17 +; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a16 +; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v16 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v18 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v15 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v14 +; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v13 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v12 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v11 +; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v10 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v9 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v8 +; 
GREEDY908-NEXT: v_accvgpr_write_b32 a13, v7 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v5 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY908-NEXT: s_nop 7 diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll index 416a601797617..1f26ba34b96e9 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll @@ -5919,22 +5919,22 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-LABEL: v_minimumnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -5953,16 +5953,16 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { 
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -5985,8 +5985,8 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 @@ -6005,7 +6005,7 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -10996,357 +10996,357 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v13 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v29 -; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v28 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v31, v33, v32, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 ; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v29 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v27 +; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v31, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; GFX10-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v26 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v51, v51 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v48, v39, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v69, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v70, 0xffff0000, v7 +; 
GFX10-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v27 -; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v37, v35, v32, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v33 -; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v70, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v71, 0xffff0000, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v36, v38, v34, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v71, 16, v23 +; GFX10-NEXT: v_lshrrev_b32_e32 v80, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v81, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v83, 0xffff0000, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v32 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v35, v34, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v37 -; GFX10-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v85, 16, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v36 -; GFX10-NEXT: v_cndmask_b32_e32 v35, v39, v33, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v34 -; GFX10-NEXT: v_cmp_lt_f32_e64 s5, v31, v38 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v26 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v53, v52, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v35 -; GFX10-NEXT: v_cmp_lt_f32_e64 s4, v39, v48 -; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v31, v31 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v84, 16, v22 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v85, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v22 +; GFX10-NEXT: v_cndmask_b32_e32 v35, v37, v33, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v36 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v86, v86 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v33 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; GFX10-NEXT: v_cmp_lt_f32_e64 s4, v51, v52 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v39, v38, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v26 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v86, v86 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v37 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v8 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v86, v86 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v39 +; GFX10-NEXT: v_cndmask_b32_e32 v38, v64, v55, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v24 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v86, v86 +; GFX10-NEXT: v_cmp_lt_f32_e64 s6, v65, v67 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v38 +; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v48, v66, v64, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v24 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 +; GFX10-NEXT: v_and_b32_e32 v86, 0xffff0000, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v87, v39, v37, s6 +; GFX10-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v55, v55, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v86, v86 +; GFX10-NEXT: v_and_b32_e32 v86, 
0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v55 +; GFX10-NEXT: v_cndmask_b32_e32 v64, v64, v48, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v48 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v86, v86 +; GFX10-NEXT: v_cndmask_b32_e32 v66, v80, v71, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 +; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v64 +; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v66 +; GFX10-NEXT: v_cndmask_b32_e32 v71, v71, v66, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX10-NEXT: v_cmp_lt_f32_e64 s8, v70, v80 +; GFX10-NEXT: v_lshrrev_b32_e32 v70, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v71 +; GFX10-NEXT: v_cndmask_b32_e32 v83, v85, v84, vcc_lo ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v49, v50 -; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v52, v38, s6 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v39, v39 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v38 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v48 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v50, v49, s6 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v31, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v39 -; GFX10-NEXT: v_cndmask_b32_e64 v50, v49, v39, s6 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v52, v52 -; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v54, v53, s6 -; GFX10-NEXT: v_cmp_lt_f32_e64 s6, v51, v55 -; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v52, v52 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v50 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v49 -; GFX10-NEXT: v_cndmask_b32_e64 v52, v53, v49, s7 -; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v55, v55 -; GFX10-NEXT: v_cmp_lt_f32_e64 s8, v31, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v65, v64, s7 -; GFX10-NEXT: 
v_and_b32_e32 v65, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX10-NEXT: v_cndmask_b32_e64 v52, v84, v83, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v83 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v49, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v52 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v51, v50, s5 +; GFX10-NEXT: v_cmp_lt_f32_e64 s5, v53, v54 +; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX10-NEXT: v_cmp_lt_f32_e64 s10, v85, v84 +; GFX10-NEXT: v_cndmask_b32_e64 v50, v50, v49, s7 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v49 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v53, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v55 -; GFX10-NEXT: v_cndmask_b32_e64 v53, v64, v55, s7 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v65, v65 -; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v53 -; GFX10-NEXT: v_cndmask_b32_e64 v65, v67, v66, s7 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v52 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v64, v64 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v65 -; GFX10-NEXT: v_cmp_lt_f32_e64 s9, v54, v67 -; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v64, v66, v65, s7 +; GFX10-NEXT: v_lshrrev_b32_e32 v84, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v50 +; GFX10-NEXT: v_cndmask_b32_e64 v53, v65, v54, s7 ; GFX10-NEXT: v_cmp_lt_f32_e64 s7, v68, v69 -; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v21 -; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v54, v54 -; GFX10-NEXT: v_lshrrev_b32_e32 v69, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v64 -; GFX10-NEXT: v_cndmask_b32_e64 v54, v67, v66, s10 -; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v68, v68 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v70, v69, s10 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v67, v67 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v70, 16, v54 -; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v68 -; GFX10-NEXT: v_cndmask_b32_e64 v66, v66, v54, s10 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v71, v71 -; GFX10-NEXT: v_lshrrev_b32_e32 v71, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v66 -; GFX10-NEXT: v_cndmask_b32_e64 v67, v69, v68, s10 -; GFX10-NEXT: v_and_b32_e32 v69, 0xffff0000, v3 -; GFX10-NEXT: v_cmp_lt_f32_e64 s11, v70, v81 -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v67 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v69, v69 -; GFX10-NEXT: v_and_b32_e32 v70, 0xffff0000, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX10-NEXT: v_cmp_lt_f32_e64 s12, v82, v83 -; GFX10-NEXT: v_cndmask_b32_e64 v69, v80, v71, s10 -; GFX10-NEXT: v_cmp_lt_f32_e64 s10, v31, v51 -; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 -; GFX10-NEXT: v_lshrrev_b32_e32 v80, 16, v18 -; GFX10-NEXT: v_and_b32_e32 v82, 0xffff0000, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v69 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v51, v71, v69, s13 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v70, v70 -; GFX10-NEXT: v_and_b32_e32 v71, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v70, v81, v80, s13 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v82, v82 -; GFX10-NEXT: v_lshrrev_b32_e32 v81, 16, v17 -; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v80, v80, v70, s13 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v71, v71 -; GFX10-NEXT: v_and_b32_e32 v71, 0xffff0000, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v82, v82, v81, s13 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v71, v71 -; GFX10-NEXT: v_cmp_lt_f32_e64 s13, v31, v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v70 -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v80 -; GFX10-NEXT: v_cndmask_b32_e64 v71, v81, v82, s14 -; GFX10-NEXT: v_cmp_lt_f32_e64 s14, v31, v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v82 -; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v71 -; GFX10-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX10-NEXT: 
v_cmp_lt_f32_e64 s15, v31, v81 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v81, 16, v16 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v31, v31 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v83, v83, v81, s16 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v31, v31 -; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v83 -; GFX10-NEXT: v_cndmask_b32_e64 v81, v81, v83, s16 +; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v69, 16, v19 +; GFX10-NEXT: v_cndmask_b32_e64 v54, v54, v53, s9 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v53 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v68, v68 +; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v54 +; GFX10-NEXT: v_cndmask_b32_e64 v68, v70, v69, s9 +; GFX10-NEXT: v_cmp_lt_f32_e64 s9, v82, v81 +; GFX10-NEXT: v_and_b32_e32 v81, 0xffff0000, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v18 +; GFX10-NEXT: v_cmp_lt_f32_e64 s12, v65, v80 +; GFX10-NEXT: v_cndmask_b32_e64 v69, v69, v68, s11 +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v68 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v81, v81 +; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v69 +; GFX10-NEXT: v_cndmask_b32_e64 v81, v84, v82, s11 +; GFX10-NEXT: v_cmp_lt_f32_e64 s11, v51, v67 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v80, v82, v81, s13 ; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v81 -; GFX10-NEXT: v_cmp_lt_f32_e64 s16, v31, v84 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v14 -; GFX10-NEXT: v_lshrrev_b32_e32 v84, 16, v30 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v31, v31 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v85, v84, s17 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v80 +; GFX10-NEXT: v_cndmask_b32_e64 v51, v65, v67, s13 +; GFX10-NEXT: v_cmp_lt_f32_e64 s13, v70, v85 +; GFX10-NEXT: v_and_b32_e32 v70, 0xffff0000, v17 +; GFX10-NEXT: v_lshrrev_b32_e32 v85, 16, v16 +; GFX10-NEXT: 
v_cmp_lt_f32_e64 s14, v84, v82 +; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v84, 0xffff0000, v16 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v70, v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v51 +; GFX10-NEXT: v_cndmask_b32_e64 v70, v82, v85, s15 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v84, v84 +; GFX10-NEXT: v_cndmask_b32_e64 v67, v67, v51, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v84, v85, v70, s15 +; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v67 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v86, 16, v84 +; GFX10-NEXT: v_cmp_lt_f32_e64 s15, v65, v82 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX10-NEXT: v_cmp_lt_f32_e64 s16, v85, v86 +; GFX10-NEXT: v_lshrrev_b32_e32 v85, 16, v14 +; GFX10-NEXT: v_cmp_u_f32_e64 s17, v65, v65 +; GFX10-NEXT: v_cndmask_b32_e64 v96, v84, v70, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v65, v85, v82, s17 ; GFX10-NEXT: v_and_b32_e32 v85, 0xffff0000, v30 ; GFX10-NEXT: v_cmp_u_f32_e64 s17, v85, v85 -; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v31 -; GFX10-NEXT: v_cndmask_b32_e64 v84, v84, v31, s17 -; GFX10-NEXT: v_lshlrev_b32_e32 v86, 16, v84 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v65 +; GFX10-NEXT: v_cndmask_b32_e64 v82, v82, v65, s17 +; GFX10-NEXT: v_lshlrev_b32_e32 v86, 16, v82 ; GFX10-NEXT: v_cmp_lt_f32_e64 s17, v85, v86 -; GFX10-NEXT: v_lshrrev_b32_e32 v86, 16, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v85, v84, v31, s17 -; GFX10-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v31 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v85, v31, s17 -; GFX10-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v84 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v31, v84, s17 -; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v85 -; GFX10-NEXT: v_cmp_eq_f32_e64 s17, 0, v84 -; GFX10-NEXT: v_cndmask_b32_e64 v84, v37, v32, s5 -; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v32 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v85, v31, s17 -; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v84, v32, 
s5 -; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v37 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v32, v37, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v36, v34, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v86, v36, v34, s4 ; GFX10-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v34 -; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v39 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v37, v34, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v85, v82, v65, s17 +; GFX10-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v65 +; GFX10-NEXT: v_cndmask_b32_e64 v65, v85, v65, s17 +; GFX10-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v82 +; GFX10-NEXT: v_cndmask_b32_e64 v65, v65, v82, s17 +; GFX10-NEXT: v_cndmask_b32_e32 v82, v32, v31, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v31 +; GFX10-NEXT: v_cndmask_b32_e32 v31, v82, v31, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v32 +; GFX10-NEXT: v_cndmask_b32_e32 v32, v31, v32, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v82 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v86, v34, s4 ; GFX10-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v36 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v34, v36, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v36, v35, v33, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33 -; GFX10-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v38 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v36 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 -; GFX10-NEXT: v_cndmask_b32_e64 v35, v48, v38, s6 -; GFX10-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v49 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v38, v35, v38, s4 -; GFX10-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v48 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v38, v48, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v35 -; GFX10-NEXT: v_cmp_eq_f32_e64 s4, 0, v48 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v50, v39, s8 -; GFX10-NEXT: 
v_cmp_eq_u16_e64 s8, 0x8000, v65 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v48, v39, s5 -; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v50 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v50, s5 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v48 -; GFX10-NEXT: v_cmp_eq_f32_e64 s5, 0, v50 -; GFX10-NEXT: v_cndmask_b32_e64 v50, v52, v49, s9 -; GFX10-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v68 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v50, v49, s6 -; GFX10-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v52 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v52, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v50 -; GFX10-NEXT: v_cmp_eq_f32_e64 s6, 0, v52 -; GFX10-NEXT: v_cndmask_b32_e64 v52, v53, v55, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v32, v82, v32, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v34, v31, v36, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v86 +; GFX10-NEXT: v_cndmask_b32_e64 v36, v35, v33, s5 +; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v33 +; GFX10-NEXT: v_cmp_eq_f32_e64 s4, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v36, v33, s5 +; GFX10-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v35 +; GFX10-NEXT: v_cndmask_b32_e64 v33, v86, v34, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v35, v31, v35, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v36 +; GFX10-NEXT: v_cmp_eq_f32_e64 s5, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v87, v37, s6 +; GFX10-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v39 +; GFX10-NEXT: v_cndmask_b32_e64 v34, v36, v35, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v31, v39, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v87 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v55, v38, s7 +; GFX10-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v38 +; GFX10-NEXT: v_cmp_eq_f32_e64 s6, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v39, v38, s7 ; GFX10-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v55 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v52, v55, s7 -; GFX10-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v53 -; GFX10-NEXT: v_cndmask_b32_e64 v53, v55, v53, s7 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v52 -; GFX10-NEXT: 
v_cmp_eq_f32_e64 s7, 0, v55 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v64, v65, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v69 -; GFX10-NEXT: v_cndmask_b32_e64 v36, v52, v53, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v65, v55, v65, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v35, v87, v37, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v38, v31, v55, s7 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v39 +; GFX10-NEXT: v_cndmask_b32_e64 v55, v64, v48, s8 +; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v48 +; GFX10-NEXT: v_cmp_eq_f32_e64 s7, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v55, v48, s8 ; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v64 -; GFX10-NEXT: v_cndmask_b32_e64 v64, v65, v64, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v65, v66, v54, s11 -; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v54 -; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v54, s8 -; GFX10-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v66 -; GFX10-NEXT: v_cndmask_b32_e64 v54, v54, v66, s8 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v65 -; GFX10-NEXT: v_cmp_eq_f32_e64 s8, 0, v66 -; GFX10-NEXT: v_cndmask_b32_e64 v66, v67, v68, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v66, v68, s9 -; GFX10-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v67 -; GFX10-NEXT: v_cndmask_b32_e64 v67, v68, v67, s9 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v66 -; GFX10-NEXT: v_cmp_eq_f32_e64 s9, 0, v68 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v51, v69, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v69, v68, v69, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v51, v69, v51, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v69, v80, v70, s14 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v70 -; GFX10-NEXT: v_cndmask_b32_e64 v70, v69, v70, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v80 -; GFX10-NEXT: v_cndmask_b32_e64 v70, v70, v80, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v80, v71, v82, s15 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v82 -; GFX10-NEXT: v_cndmask_b32_e64 v82, v80, v82, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v71 -; GFX10-NEXT: v_cndmask_b32_e64 v71, 
v82, v71, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v82, v81, v83, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v36, v39, v38, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v48, v31, v64, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v55 +; GFX10-NEXT: v_cndmask_b32_e64 v64, v71, v66, s9 +; GFX10-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v66 +; GFX10-NEXT: v_cmp_eq_f32_e64 s8, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v64, v66, s9 +; GFX10-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v71 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v55, v48, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v66, v31, v71, s9 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v64 +; GFX10-NEXT: v_cndmask_b32_e64 v71, v52, v83, s10 ; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v82 -; GFX10-NEXT: v_cndmask_b32_e64 v83, v82, v83, s10 -; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v81 -; GFX10-NEXT: v_cndmask_b32_e64 v81, v83, v81, s10 -; GFX10-NEXT: buffer_load_dword v83, off, s[0:3], s32 -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v85, v85 -; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v14 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v85, v85 -; GFX10-NEXT: v_cndmask_b32_e64 v85, v14, v30, s11 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v14, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v84 -; GFX10-NEXT: v_cndmask_b32_e64 v87, v30, v85, s11 -; GFX10-NEXT: v_cmp_eq_f32_e64 s12, 0, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v30, v35, v38, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v35, v50, v49, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v65, v54, s8 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v80 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v84, v32, s12 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v37 -; GFX10-NEXT: v_and_b32_e32 v84, 0xffff0000, v15 -; GFX10-NEXT: v_cmp_eq_f32_e64 s12, 0, v32 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v37, v34, s12 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v55 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v48, v39, s5 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v68 -; GFX10-NEXT: v_lshlrev_b32_e32 
v48, 16, v69 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v37 -; GFX10-NEXT: v_cndmask_b32_e32 v37, v55, v64, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX10-NEXT: v_cmp_eq_f32_e64 s9, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v71, v83, s10 +; GFX10-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v52 +; GFX10-NEXT: v_cndmask_b32_e64 v83, v50, v49, s11 +; GFX10-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v49 +; GFX10-NEXT: v_cndmask_b32_e64 v38, v64, v66, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v52, v31, v52, s10 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v71 +; GFX10-NEXT: v_cmp_eq_f32_e64 s10, 0, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v83, v49, s11 +; GFX10-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v50 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v71, v52, s10 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v96 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v31, v50, s11 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v83 +; GFX10-NEXT: v_cndmask_b32_e64 v50, v54, v53, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v53 +; GFX10-NEXT: v_cmp_eq_f32_e64 s11, 0, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v86, 16, v50 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v50, v53, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v54 +; GFX10-NEXT: v_cndmask_b32_e64 v48, v83, v49, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v53, v31, v54, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v54, v69, v68, s13 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v68 +; GFX10-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v87, 16, v54 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v54, v68, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v69 +; GFX10-NEXT: v_cndmask_b32_e64 v68, v31, v69, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v69, v80, v81, s14 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v81 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v69 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v69, v81, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v80 +; GFX10-NEXT: v_cndmask_b32_e64 v81, v67, v51, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v80, v31, v80, s12 +; 
GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v81 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v81, v51, s12 +; GFX10-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v67 +; GFX10-NEXT: v_cndmask_b32_e64 v51, v31, v67, s12 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v31, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v96, v70, s13 +; GFX10-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v84 +; GFX10-NEXT: v_cndmask_b32_e64 v70, v31, v84, s13 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v85 +; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v14 +; GFX10-NEXT: v_cmp_eq_f32_e64 s13, 0, v31 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v84, v84 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v85, v65, s13 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v66, v14, v30, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v85, 16, v30 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v85, v85 +; GFX10-NEXT: v_cndmask_b32_e64 v83, v30, v66, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v83 -; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v83 -; GFX10-NEXT: v_cndmask_b32_e64 v64, v15, v83, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v66, v67, s9 -; GFX10-NEXT: v_cndmask_b32_e32 v54, v86, v50, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v64 -; GFX10-NEXT: v_cndmask_b32_e32 v53, v50, v54, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v54 -; GFX10-NEXT: v_cndmask_b32_e32 v55, v83, v64, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v55 -; GFX10-NEXT: v_cndmask_b32_e32 v39, v68, v51, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v53 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 -; GFX10-NEXT: v_cndmask_b32_e32 v48, v69, v70, vcc_lo -; 
GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v50, v51 -; GFX10-NEXT: v_cndmask_b32_e32 v51, v53, v54, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v66 -; GFX10-NEXT: v_cndmask_b32_e32 v65, v55, v64, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49 -; GFX10-NEXT: v_cndmask_b32_e32 v50, v80, v71, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54 -; GFX10-NEXT: v_cndmask_b32_e32 v49, v51, v54, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64 -; GFX10-NEXT: v_cndmask_b32_e32 v54, v65, v64, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v51 -; GFX10-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v65 -; GFX10-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v67 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v67 +; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v67 +; GFX10-NEXT: v_cndmask_b32_e64 v84, v15, v67, s12 +; GFX10-NEXT: v_cndmask_b32_e32 v82, v82, v49, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v86 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v50, v53, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v82 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v84 +; GFX10-NEXT: v_cndmask_b32_e32 v49, v49, v82, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX10-NEXT: v_cndmask_b32_e32 v53, v67, v84, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v87 +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v53 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v54, v68, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v49 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v55 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v69, v80, vcc_lo +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v50, v54 +; GFX10-NEXT: v_cndmask_b32_e32 v54, v49, v82, vcc_lo +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v67 +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v54 +; GFX10-NEXT: 
v_cndmask_b32_e32 v55, v53, v84, vcc_lo ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v29 +; GFX10-NEXT: v_cndmask_b32_e32 v50, v81, v51, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82 +; GFX10-NEXT: v_cndmask_b32_e32 v51, v54, v82, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84 +; GFX10-NEXT: v_cndmask_b32_e32 v64, v55, v84, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49 ; GFX10-NEXT: v_cndmask_b32_e32 v49, v51, v49, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; GFX10-NEXT: v_cndmask_b32_e32 v53, v64, v53, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v29 +; GFX10-NEXT: v_cndmask_b32_e32 v49, v54, v49, vcc_lo ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 -; GFX10-NEXT: v_cndmask_b32_e32 v52, v82, v81, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v87 -; GFX10-NEXT: v_cndmask_b32_e32 v51, v65, v54, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v85 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v52, v96, v70, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 +; GFX10-NEXT: v_cndmask_b32_e32 v51, v55, v53, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v83 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v66 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v28 +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v53 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v53, v87, v85, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v53, v83, v66, vcc_lo ; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v64, v64 ; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v29 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v85 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v55, v53, v85, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66 +; GFX10-NEXT: v_cndmask_b32_e32 v55, v53, v66, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v54, v28, v12, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v55, v87, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83 +; GFX10-NEXT: v_cndmask_b32_e32 v28, v55, v83, vcc_lo ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v64 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v53 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v54 @@ -11377,13 +11377,12 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v26 -; GFX10-NEXT: v_perm_b32 v13, v14, v13, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v53, v12, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v11 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 ; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v9 -; GFX10-NEXT: v_perm_b32 v14, v31, v28, 0x5040100 -; GFX10-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v25 @@ -11409,7 +11408,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x 
bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v53 -; GFX10-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v53, v10, vcc_lo ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54 ; GFX10-NEXT: v_cndmask_b32_e32 v27, v25, v9, vcc_lo @@ -11422,7 +11421,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v53, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25 -; GFX10-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 @@ -11444,7 +11443,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v22 -; GFX10-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v26 ; GFX10-NEXT: v_cndmask_b32_e32 v26, v24, v8, vcc_lo @@ -11470,13 +11469,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6 -; GFX10-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc_lo ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo ; GFX10-NEXT: 
v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22 -; GFX10-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 @@ -11488,7 +11487,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v5 -; GFX10-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v39, v6, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v21 @@ -11522,7 +11521,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; GFX10-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v48, v5, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 @@ -11538,10 +11537,11 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v17 -; GFX10-NEXT: v_perm_b32 v3, v39, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v15, v3, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX10-NEXT: v_perm_b32 v15, v49, v51, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v2 @@ -11585,10 +11585,10 @@ define <32 x bfloat> 
@v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX10-NEXT: v_perm_b32 v0, v52, v0, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 -; GFX10-NEXT: v_perm_b32 v2, v48, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v30, v2, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v4, v15, v4, 0x5040100 -; GFX10-NEXT: v_perm_b32 v15, v49, v51, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v14, v31, v28, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimumnum_v32bf16: @@ -12212,365 +12212,377 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-LABEL: v_minimumnum_v32bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v52, v52, v51, s1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v84, 16, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v38, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v34, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v30 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v22 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v48, v48, v39, s0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v80, v71, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v2 
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v84, v83, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v96, v87, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v100, v99, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s2, v54, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v112, v103, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v114, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v26 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v116, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v118, v118 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v64, v64, v55, s2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v10 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v128, v119, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v100, v132, v131, vcc_lo -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v96 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v147, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 
0xffff0000, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v48, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v39, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v49, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v48, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v49 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v38, v53 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v51, v50 :: v_dual_lshlrev_b32 v53, 16, v52 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v48, v54, vcc_lo +; 
GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v52, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v100 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s3, v66, v66 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v68, v68, v67, s3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v68 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v80 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v26 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v66, v65 :: v_dual_and_b32 v66, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v55 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v67, v69, v68 :: v_dual_and_b32 v70, 0xffff0000, v25 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v83, v83, v80, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v54 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v64 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v67 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v68, v67, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v144, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v66 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v54, v64, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v68, v69 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v65, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v70, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v82, v81, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v70 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v96, v87, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 16, v84 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v97, v99, v98 :: v_dual_and_b32 v96, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v99, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v81, v81, v70 :: v_dual_lshlrev_b32 v100, 16, v97 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v85, v87, v86 :: v_dual_lshlrev_b32 v96, 16, v81 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v85 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v87, v98, v97 :: v_dual_lshlrev_b32 v98, 16, v86 +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v84, v96 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v84, v81, v70 :: v_dual_lshlrev_b32 v101, 16, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v98, v99 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v96, v85, v86 :: v_dual_and_b32 v99, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v100, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v87, v97, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v99, v101, v100 :: v_dual_lshlrev_b32 v80, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v101, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v99, v99, v84, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v100, v100, v99, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v113, v115, v96, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v101, v115, v114, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v113, v114, v101, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v115, v115 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v114, v118, v117 :: v_dual_and_b32 v115, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v116, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v118, 16, v101 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v116, v100, v99 :: v_dual_lshlrev_b32 v119, 16, v113 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v115, v115 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v115, v117, v114, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v118, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v39 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v115 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v118, v113, v101, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v117, v129, v128, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v119, v130 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v119, v115, v114 :: v_dual_lshlrev_b32 v130, 16, v117 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v117, v131, v100, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 16, v86 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v119, v135, v102, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e64 s0, v49, v130 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v14, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v70 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v117 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v37, v129 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v129, v51, v52, s0 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v55 -; 
GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v131 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v71 -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v132 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v68, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v69, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 16, v87 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65 -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v81, v134 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v83, v80, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v85, v135 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 16, v103 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v87, v82, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v97, v144 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98 -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v101, v145 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97 -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v112, v146 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v113, v96, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v114, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v114, v115, v98, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v116, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v116, v117, v100, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v118, v30 -; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v118, v119, v102, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v128, v49 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v128, v38, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v133, v135, v134 :: v_dual_lshlrev_b32 v102, 16, v84 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v135, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v144, v146, v145, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v128, v128, v117 :: v_dual_lshlrev_b32 v147, 16, v144 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v132, v134, v133 :: v_dual_lshlrev_b32 v135, 16, v128 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v134, v145, v144 :: v_dual_lshlrev_b32 v145, 16, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v130, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 16, v134 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v130, v128, v117, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v145, v146 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v135, v132, v133, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v147, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v14 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v145, v134, v144 :: v_dual_lshlrev_b32 v148, 16, v30 +; GFX11-FAKE16-NEXT: 
v_cmp_eq_u16_e32 vcc_lo, 0x8000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v34, v37, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v130, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v48, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v50 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v38, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v147, v14, v30, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v148, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v148, v30, v147 :: v_dual_lshlrev_b32 v103, 16, v96 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v129, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v39, v52, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v129 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v53, v64, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v65, v68, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v70 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v69, v70, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v80 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v81, v80, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v85, v82, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v84, v97, v84, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v86 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v101, v86, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v96 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v112, v96, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v98 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v100 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v102 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v118, v102, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v128, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v128 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v64, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v68, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114 -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v80, v83, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v52, v68, v55, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v67 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v53, v64, vcc_lo 
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v55, v69, v67, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v70 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v30, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v65 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v64, v84, v70, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v86 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v67, v96, v86, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v97 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v81 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v70, v98, v97, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v99 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v81, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v85 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v86, v116, v99, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v101 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v85, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v82, v87, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v99 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v84, v99, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v103 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v97, v118, 
v101, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v114 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v70, v87, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v99, v119, v114, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v117 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v86, v100, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v96, v113, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v101, v130, v117, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v133 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v97, v113, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v115 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v98, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v81 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v83, v100, v117, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v119 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v35, v119, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v130, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v114, v135, v133, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v144 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v99, v115, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v117, v145, v144, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v49 +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v101, v128, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v132 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v49, v14, v49, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v114, v132, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v135 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v87, v117, v134, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v34, v36, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v38, v37 :: v_dual_lshlrev_b32 v129, 16, v116 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v71 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v39, v49 :: v_dual_lshlrev_b32 v131, 16, v118 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v80 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v53, v54 :: v_dual_lshlrev_b32 v150, 16, v145 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v82 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v36, v68, v52 :: v_dual_and_b32 v53, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v83 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v129, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v131 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v53, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v65, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v133 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v69, v64, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v134 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v81, v67, vcc_lo -; GFX11-FAKE16-NEXT: 
v_cmp_eq_f32_e32 vcc_lo, 0, v135 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v85, v68, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v144 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v97, v70, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v145 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v101, v71, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v15, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v69, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v102 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v84, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v103 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v96, v65 :: v_dual_lshlrev_b32 v50, 16, v130 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v112 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v98, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v129 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v116, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v15, v31, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 16, v116 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v54 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v112 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v146 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v112, v80, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v131 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v118, v70, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v33, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52 -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v114, v82, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v148 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v116, v83, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v50, v64 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v52, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v67 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v64 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v53, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v65 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v118, v84, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v31, v54 :: v_dual_lshlrev_b32 v64, 16, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 16, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v53 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v149 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v119, v81, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v130, v85, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, 
v64 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v52, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v66 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v53, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v50, v135, v86 :: v_dual_lshlrev_b32 v65, 16, v64 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v64, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v65, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v55, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v54, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v52 :: v_dual_lshlrev_b32 v54, 16, v55 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v67 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v64, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v128, v86, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v68 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v65, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v53, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v55, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v147 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v145, v87, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v148, v147 :: v_dual_lshlrev_b32 v65, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v28 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 
v54, v53, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v147 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v54, v147 :: v_dual_lshlrev_b32 v66, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v148 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v148, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v54 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v28 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v29, v13, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v54, v53 :: v_dual_lshlrev_b32 v64, 16, v55 ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v66, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v27 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v53, 0x5040100 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v28, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v28, v12 :: v_dual_lshlrev_b32 v65, 16, v27 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29 ; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v13, v13, v29, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v11 @@ -12588,33 +12600,32 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v54, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v54, v12 :: v_dual_lshlrev_b32 v55, 16, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v28 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v10 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v25 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v54, 16, v26 ; 
GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v9 @@ -12622,64 +12633,63 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v27 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v25, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX11-FAKE16-NEXT: 
v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v26, 16, v8 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v53, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v27 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v24, v8 :: v_dual_lshlrev_b32 v29, 16, v7 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v22 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v24, v8, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v28 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v23, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v23, v7 :: v_dual_lshlrev_b32 v28, 16, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v26, v8 :: v_dual_lshlrev_b32 v25, 16, v22 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo ; 
GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v27 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc_lo @@ -12687,48 +12697,49 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v5 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v22, 16, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v20 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v21 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v20 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v21, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v20, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v20, v4 :: v_dual_lshlrev_b32 v25, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4 @@ -12740,48 +12751,49 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v24, v3 :: v_dual_lshlrev_b32 v20, 16, v2 ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v19, 16, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v0 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v31, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_lshlrev_b32 v20, 16, v16 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v1 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v20, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v23, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v16 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v24, v20 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v23 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v16, v0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 @@ -12789,6 +12801,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo @@ -12797,20 +12810,19 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v52, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v2, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v4, v15, v4, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v51, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -13522,478 +13534,479 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v25 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v12 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v13 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v9 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v52, v52, v51, s1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v13 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v21 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v8 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v8 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v38, v38 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v9 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v34, v33, vcc_lo +; GFX12-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v30 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v7 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v22 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v48, v48, v39, s0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v6 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v20 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX12-FAKE16-NEXT: v_and_b32_e32 
v37, 0xffff0000, v30 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v29 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v4 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v18 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v80, v71, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v2 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v80, v84, v83, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v26 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v17 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v1 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v82, v96, v87, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v25 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v11 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v84, v100, v99, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v16 -; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v34, 16, v0 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s2, v54, v54 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v112, v103, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v114, v114 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v22 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v26 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v96, v116, v115, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v118, v118 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v21 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v64, v64, v55, s2 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v10 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v24 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v8 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v9 ; GFX12-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v98, v128, v119, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v20 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v100, v132, v131, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v17 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v96 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v33, 
v37, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v22 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v19 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v8 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v20 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7 +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v34, v35 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v18 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v16 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v147, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v36, v37, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v17 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v48, v39, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v19 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, 
vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v39, v50, vcc_lo +; GFX12-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v50 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v49, v48, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v100 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v48, v52, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v49 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v38, v53 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v10 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v34 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s3, v66, v66 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v68, v68, v67, s3 +; 
GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v38, v51, v50 :: v_dual_lshlrev_b32 v53, 16, v52 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v68 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v80 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v48, v54, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v55 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v52, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v26 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v83, v83, v80, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v30 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 16, v84 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v64, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v16 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v55, v66, v65 :: v_dual_and_b32 v66, 0xffff0000, v26 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v55 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: 
v_cndmask_b32_e32 v99, v99, v84, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v67, v69, v68 :: v_dual_and_b32 v70, 0xffff0000, v25 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v54 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v55, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v64 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v67 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v113, v115, v96, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v82 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v68, v67, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v69 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v55 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v65 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v144, 0xffff0000, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v66 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v54, v64, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v68, v69 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v117, v131, v100, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145 -; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 16, v86 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v68, v65, v55, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v70, v71 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v119, v135, v102, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v69, v66, v67, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e64 s0, v49, v130 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v82, v81, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v70 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v14, v30 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v70 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v117 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v96, v87, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v37, v129 -; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v129, v51, v52, s0 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v97, v99, v98 :: v_dual_and_b32 v96, 0xffff0000, v23 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v85, v85 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v99, 0xffff0000, v22 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v55 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v131 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v81, v81, v70 :: v_dual_lshlrev_b32 v100, 16, v97 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v71 -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v132 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v85, v87, v86 :: v_dual_lshlrev_b32 v96, 16, v81 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v68 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v18 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v85 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v68, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v69, v133 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 16, v87 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v87, v98, v97 :: v_dual_lshlrev_b32 v98, 16, v86 +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v84, v96 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65 -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v81, v134 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v84, v81, v70 :: v_dual_lshlrev_b32 v101, 16, v87 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v98, v99 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v81, v83, v80, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v85, v135 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 16, v103 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v96, v85, v86 :: v_dual_and_b32 v99, 0xffff0000, v5 +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v100, v101 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v21 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v85, v87, v82, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v97, v144 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v98, v87, v97, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98 -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v101, v145 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v115 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v99, v101, v100 :: v_dual_lshlrev_b32 v80, 16, v53 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v21 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v99 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v100, v100, v99, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v20 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97 -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v112, v146 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v100 ; GFX12-FAKE16-NEXT: s_wait_alu 
0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v112, v113, v96, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v114, v147 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v119 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v38 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v101, v115, v114, vcc_lo +; GFX12-FAKE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v114, v115, v98, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v116, v14 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v113, v114, v101, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v115, v115 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v116, v117, v100, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v118, v30 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v114, v118, v117 :: v_dual_and_b32 v115, 0xffff0000, v19 +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v116, v119 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v69 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v118, 16, v101 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v118, v119, v102, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v128, v49 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v116, v100, v99 :: v_dual_lshlrev_b32 v119, 16, v113 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v115, v115 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v128, v38, v34, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v115, v117, v114, vcc_lo +; GFX12-FAKE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v118, v119 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v39 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v114 +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v130, 16, v115 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v130, v36, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v118, v113, v101, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v48, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v117, v129, v128, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v119, v130 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v129, v52, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v129 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v119, v115, v114 :: v_dual_lshlrev_b32 v130, 16, v117 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v53, v64, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v133, v135, v134 :: v_dual_lshlrev_b32 v102, 16, v84 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v135, 0xffff0000, v17 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v68, v65, v68, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v70 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v144, v146, v145, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v16 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v69, v70, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v80 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; 
GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v128, v128, v117 :: v_dual_lshlrev_b32 v147, 16, v144 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v80, v81, v80, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v132, v134, v133 :: v_dual_lshlrev_b32 v135, 16, v128 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v82, v85, v82, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v134, v145, v144 :: v_dual_lshlrev_b32 v145, 16, v133 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v132 +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v130, v135 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 16, v134 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v84, v97, v84, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v86 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v130, v128, v117, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v145, v146 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v101, v86, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v96 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v135, v132, v133, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v147, v148 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v98 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v14 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v96, v112, v96, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v98 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v145, v134, v144 :: v_dual_lshlrev_b32 v148, 16, v30 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v37 ; GFX12-FAKE16-NEXT: s_wait_alu 
0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v100 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v34, v37, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v36, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v50 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v102 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v38, v50, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v118, v102, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v51, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v147, v14, v30, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v148, v148 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v128, v34, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v148, v30, v147 :: v_dual_lshlrev_b32 v103, 16, v96 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52 ; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v51, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v128 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v39, v52, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v52, v68, v55, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v67 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v64, v55, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v30, v53, v64, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v55, v69, v67, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v70 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v30, v54, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v65 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v64, v84, v70, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v86 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v68, v67, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v65, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v67, v96, v86, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v97 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114 -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v66, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v81 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff 
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v70, v98, v97, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v99 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v80, v83, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v81, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v85 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v86, v116, v99, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v101 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v68, v82, v87, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v99 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v85, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v97, v118, v101, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v114 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v84, v99, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v103 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v70, v87, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v100 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v99, v119, v114, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v117 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v86, v100, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v113 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v37 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v101, v130, v117, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v133 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v80, v96, v113, vcc_lo +; 
GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v97, v113, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v115 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v114, v135, v133, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v144 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v82, v98, v115, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v117 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v81 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v81, v99, v115, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v128 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v15 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v117, v145, v144, s0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v49 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v83, v100, v117, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v119 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v85, v101, v128, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v132 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v49, v14, v49, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v84, v35, v119, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v114, v132, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v134 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v135 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v34, v38, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v87, v117, v134, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v130, v14, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v34, v36, vcc_lo ; 
GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v36, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v30, v38, v37 :: v_dual_lshlrev_b32 v129, 16, v116 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v71 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v129, v39, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v131 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v34, v39, v49 :: v_dual_lshlrev_b32 v131, 16, v118 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v80 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v53, v49, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v132 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v31 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v35, v53, v54 :: v_dual_lshlrev_b32 v150, 16, v145 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v82 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v65, v55, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v133 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v36, v68, v52 :: v_dual_and_b32 v53, 0xffff0000, v31 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v83 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v69, v64, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v134 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v69, v55, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v102 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v81, v67, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v135 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v84, v64, vcc_lo +; 
GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v103 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v85, v68, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v144 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v39, v96, v65 :: v_dual_lshlrev_b32 v50, 16, v130 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v112 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v97, v70, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v145 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v98, v66, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v129 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v101, v71, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v116, v67, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v15, v31, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v15, v31, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 16, v116 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v55 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v54 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v52, vcc_lo -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v112 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v33 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v146 +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v131 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v33 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; 
GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v112, v80, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v118, v70, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v33, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v118 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52 -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v147 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v53 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v31, v54 :: v_dual_lshlrev_b32 v64, 16, v52 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 16, v119 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v53 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v149 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v31, v114, v82, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v148 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v31, v119, v81, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v32, v116, v83, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v50, v64 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v32, v130, v85, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v64 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v52, v33, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v67 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v64 +; GFX12-FAKE16-NEXT: 
v_cndmask_b32_e32 v55, v52, v33, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v66 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v53, v55, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v102 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v65 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v53, v54, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v118, v84, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v50, v135, v86 :: v_dual_lshlrev_b32 v65, 16, v64 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v64, v33, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v55, v33, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v65, v55, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v54, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v52, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v52 :: v_dual_lshlrev_b32 v54, 16, v55 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v53, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v67 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v53, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v148 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v13 ; GFX12-FAKE16-NEXT: s_wait_alu 
0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v64, v33, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v55, v33, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v150 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v147 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v128, v86, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v68 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v145, v87, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v65, v53, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v66 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v29 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v12 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v51, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v12 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v53 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v54, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v13 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v148, v147 :: v_dual_lshlrev_b32 v65, 16, v13 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc_lo -; GFX12-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v28 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v28 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v147 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v53, v54, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v55, v54, v147 :: v_dual_lshlrev_b32 v66, 16, v12 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66 +; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v148 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v66, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v148, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v64 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v53 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v54 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v28 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v29, v13, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 ; GFX12-FAKE16-NEXT: s_wait_alu 
0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v54, v53 :: v_dual_lshlrev_b32 v64, 16, v55 ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v66, v65 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v27 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_perm_b32 v14, v14, v53, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v28, v12, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v28, v12 :: v_dual_lshlrev_b32 v65, 16, v27 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo @@ -14015,31 +14028,33 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v26 -; GFX12-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v27 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v54, v12, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v54, v12 :: v_dual_lshlrev_b32 v55, 16, v26 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v26 :: 
v_dual_lshlrev_b32 v29, 16, v9 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v28 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v10 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v25 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v54, 16, v26 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo @@ -14047,9 +14062,6 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v10 -; GFX12-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v9 @@ -14060,8 +14072,6 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; 
GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v10, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54 @@ -14072,57 +14082,57 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v26, 16, v8 ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v10, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25 +; GFX12-FAKE16-NEXT: v_perm_b32 v14, v14, v53, 0x5040100 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v7 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v24 +; GFX12-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v27 -; 
GFX12-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v24, v24, v8 :: v_dual_lshlrev_b32 v29, 16, v7 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24 ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v8 -; GFX12-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v22 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v26 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v26, v24, v8 :: 
v_dual_lshlrev_b32 v25, 16, v22 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v24, v8, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v28 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v23, v7, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v27, v23, v7 :: v_dual_lshlrev_b32 v28, 16, v6 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v26, v8 :: v_dual_lshlrev_b32 v25, 16, v22 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo @@ -14130,11 +14140,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GFX12-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v22 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v27 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd @@ -14144,23 +14153,22 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX12-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v27, v7, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v20 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd @@ -14168,14 +14176,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v21 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; GFX12-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 @@ -14184,10 +14195,8 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v24 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v20 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; GFX12-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v21, v5, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc_lo @@ -14195,10 +14204,9 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v24 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v19 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v20, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v22, v20, v4 :: v_dual_lshlrev_b32 v25, 16, v3 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo @@ -14208,56 +14216,59 @@ define <32 
x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v24 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v19, v3, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v24, v3 :: v_dual_lshlrev_b32 v20, 16, v2 ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX12-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | 
instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_perm_b32 v3, v31, v3, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_lshlrev_b32 v20, 16, v16 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v23, v19 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v16 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v24, v20 @@ -14265,9 +14276,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v23 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v16, v0, vcc_lo @@ -14278,6 +14287,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17 @@ -14289,23 +14299,23 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: 
v_cndmask_b32_e32 v0, v0, v16, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19 ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_perm_b32 v0, v52, v0, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_perm_b32 v2, v32, v2, 0x5040100 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_perm_b32 v4, v15, v4, 0x5040100 ; GFX12-FAKE16-NEXT: v_perm_b32 v15, v33, v51, 0x5040100 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 558006d2b6957..c6f78044cc35d 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -4642,77 +4642,77 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v17 +; 
GFX7-SDAG-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v21 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v20 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v22 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v23 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v21 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v18 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v25 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v16 -; GFX7-SDAG-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v25 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v16 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v24 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v26 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v20 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v21, v23 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v20 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v16 ; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v9, v18 ; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v19 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v28 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v29 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v30 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 ; 
GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v21 ; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v20 -; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v16 ; GFX7-SDAG-NEXT: v_min_f32_e32 v13, v13, v18 ; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v14, v19 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v15, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v15, v17 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v16f16: @@ -4725,46 +4725,46 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v17 +; GFX7-GISEL-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v20 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v18 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v21 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v20 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v17 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v19 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v22 -; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v18 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v21 +; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v23 ; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v16 -; GFX7-GISEL-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; 
GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v19 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v23 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v22 +; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v19 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v24 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v24 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v25 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v25 +; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v26 -; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v7, v17 -; GFX7-GISEL-NEXT: v_min_f32_e32 v8, v8, v18 -; GFX7-GISEL-NEXT: v_min_f32_e32 v9, v9, v19 -; GFX7-GISEL-NEXT: v_min_f32_e32 v10, v10, v20 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v26 +; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v7, v18 +; GFX7-GISEL-NEXT: v_min_f32_e32 v8, v8, v19 +; GFX7-GISEL-NEXT: v_min_f32_e32 v9, v9, v20 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v27 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v27 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v28 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v28 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v29 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v29 +; GFX7-GISEL-NEXT: v_min_f32_e32 v10, v10, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v30 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v30 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-GISEL-NEXT: v_min_f32_e32 v11, v11, v17 -; GFX7-GISEL-NEXT: v_min_f32_e32 v12, v12, v18 -; GFX7-GISEL-NEXT: v_min_f32_e32 v13, v13, v19 -; GFX7-GISEL-NEXT: v_min_f32_e32 v14, v14, v20 +; GFX7-GISEL-NEXT: v_min_f32_e32 v11, v11, v18 +; GFX7-GISEL-NEXT: v_min_f32_e32 v12, v12, v19 +; GFX7-GISEL-NEXT: v_min_f32_e32 v13, v13, v20 +; GFX7-GISEL-NEXT: v_min_f32_e32 v14, 
v14, v16 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -4781,8 +4781,8 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v13, v13 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-GISEL-NEXT: v_min_f32_e32 v15, v15, v16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-GISEL-NEXT: v_min_f32_e32 v15, v15, v17 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v15, v15 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4791,27 +4791,26 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v25, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_sdwa v16, v17, v16 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v25, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15 ; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7 ; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14 @@ -4828,7 +4827,8 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8 ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_sdwa v21, v23, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_sdwa v19, v19, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; 
GFX8-SDAG-NEXT: v_min_f16_e32 v7, v7, v15 ; GFX8-SDAG-NEXT: v_min_f16_e32 v6, v6, v14 ; GFX8-SDAG-NEXT: v_min_f16_e32 v5, v5, v13 @@ -4837,13 +4837,13 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v10 ; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v9 ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v8 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v23 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v22 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v21 -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20 -; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v19 -; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v18 -; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v17 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v19 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v24 +; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v22 +; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v20 +; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v18 ; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5790,97 +5790,102 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v31, v0, v0 ; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v16, v16 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v1, v1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v31, v31, v32 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v16, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v31, v31, v32 -; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v16, v1, v1 ; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v17, v17 +; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
+; GFX8-GISEL-NEXT: v_min_f16_e32 v16, v33, v32 +; GFX8-GISEL-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v17, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v16, v16, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v34, v2, v2 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v2, v2 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v18, v18 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v18, v18 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v18, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v17, v17, v32 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v3, v3 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v3, v3 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v19, v19 +; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v19, v19 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v19, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v18, v18, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v18, v33, v18 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v4, v4 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v3, v3, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v4, v4 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v20, v20 +; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v20, v20 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v20, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v19, v19, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v19, v33, v19 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v5, v5 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v4, v4, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v20, v5, v5 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v21, v21 +; GFX8-GISEL-NEXT: v_max_f16_e32 v20, v21, v21 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v21, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v20, v20, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v20, v33, v20 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v6, v6 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v5, v5, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v21, v6, v6 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v22, v22 +; GFX8-GISEL-NEXT: v_max_f16_e32 v21, v22, v22 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v22, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v21, v21, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v21, v33, v21 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v7, v7 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v6, v6, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v22, v7, v7 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v23, v23 +; GFX8-GISEL-NEXT: v_max_f16_e32 v22, v23, v23 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v23, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v22, v22, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v22, v33, v22 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v8, v8 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v7, v7, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v23, v8, v8 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v24, v24 +; GFX8-GISEL-NEXT: v_max_f16_e32 v23, v24, v24 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v8, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v24, v24, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v23, v23, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v23, v33, v23 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v9, v9 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v8, v8, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v24, v9, v9 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v25, v25 +; GFX8-GISEL-NEXT: v_max_f16_e32 v24, v25, v25 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v9, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v25, v25, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v24, v24, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v24, v33, v24 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v10, v10 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v9, v9, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v25, v10, v10 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v26, v26 +; GFX8-GISEL-NEXT: v_max_f16_e32 v25, v26, v26 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v10, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v26, v26, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v25, v25, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v25, v33, v25 +; 
GFX8-GISEL-NEXT: v_max_f16_e32 v33, v11, v11 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v10, v10, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v26, v11, v11 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v27, v27 +; GFX8-GISEL-NEXT: v_max_f16_e32 v26, v27, v27 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v11, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v27, v27, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v26, v26, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v26, v33, v26 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v12, v12 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v11, v11, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v27, v12, v12 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v28, v28 +; GFX8-GISEL-NEXT: v_max_f16_e32 v27, v28, v28 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v12, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v28, v28, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v27, v27, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v27, v33, v27 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v13, v13 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v12, v12, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v28, v13, v13 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v29, v29 +; GFX8-GISEL-NEXT: v_max_f16_e32 v28, v29, v29 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v13, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v29, v29, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v28, v28, v32 +; GFX8-GISEL-NEXT: v_min_f16_e32 v28, v33, v28 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v14, v14 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v13, v13, v29 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v29, v14, v14 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v30, v30 +; GFX8-GISEL-NEXT: v_max_f16_e32 v29, v30, v30 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v30, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v29, v33, v29 +; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v15, v15 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v14, v14, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: buffer_load_dword v30, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_min_f16_e32 v29, v29, v32 -; GFX8-GISEL-NEXT: v_max_f16_e32 v32, v15, v15 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8-GISEL-NEXT: v_max_f16_e32 v30, v32, v32 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v32, v32, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v17, v34, v17 +; GFX8-GISEL-NEXT: v_min_f16_e32 v30, v33, v30 +; GFX8-GISEL-NEXT: v_min_f16_sdwa v15, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v31, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v16, v1 ; GFX8-GISEL-NEXT: v_or_b32_e32 v2, v17, v2 @@ -5896,12 +5901,7 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v12, v27, v12 ; GFX8-GISEL-NEXT: v_or_b32_e32 v13, v28, v13 ; GFX8-GISEL-NEXT: v_or_b32_e32 v14, v29, v14 -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_max_f16_e32 v33, v30, v30 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v30, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v32, v32, v33 -; GFX8-GISEL-NEXT: v_min_f16_sdwa 
v15, v15, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_or_b32_e32 v15, v32, v15 +; GFX8-GISEL-NEXT: v_or_b32_e32 v15, v30, v15 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: v_minimumnum_v32f16: @@ -5911,6 +5911,7 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX900-SDAG-NEXT: v_pk_min_f16 v0, v0, v16 ; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v17, v17 +; GFX900-SDAG-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX900-SDAG-NEXT: v_pk_min_f16 v1, v1, v16 ; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v18, v18 @@ -5918,44 +5919,43 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_min_f16 v2, v2, v16 ; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v19, v19 ; GFX900-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX900-SDAG-NEXT: v_pk_min_f16 v3, v3, v16 -; GFX900-SDAG-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX900-SDAG-NEXT: v_pk_max_f16 v17, v20, v20 +; GFX900-SDAG-NEXT: v_pk_max_f16 v18, v20, v20 ; GFX900-SDAG-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX900-SDAG-NEXT: v_pk_max_f16 v18, v21, v21 +; GFX900-SDAG-NEXT: v_pk_max_f16 v19, v21, v21 ; GFX900-SDAG-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX900-SDAG-NEXT: v_pk_max_f16 v19, v22, v22 +; GFX900-SDAG-NEXT: v_pk_max_f16 v20, v22, v22 ; GFX900-SDAG-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX900-SDAG-NEXT: v_pk_max_f16 v20, v23, v23 +; GFX900-SDAG-NEXT: v_pk_max_f16 v21, v23, v23 ; GFX900-SDAG-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX900-SDAG-NEXT: v_pk_max_f16 v21, v24, v24 +; GFX900-SDAG-NEXT: v_pk_max_f16 v22, v24, v24 ; GFX900-SDAG-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX900-SDAG-NEXT: v_pk_max_f16 v22, v25, v25 +; GFX900-SDAG-NEXT: v_pk_max_f16 v23, v25, v25 ; GFX900-SDAG-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX900-SDAG-NEXT: v_pk_max_f16 v23, v26, v26 +; GFX900-SDAG-NEXT: v_pk_max_f16 v24, v26, v26 ; GFX900-SDAG-NEXT: v_pk_max_f16 
v10, v10, v10 -; GFX900-SDAG-NEXT: v_pk_max_f16 v24, v27, v27 +; GFX900-SDAG-NEXT: v_pk_max_f16 v25, v27, v27 ; GFX900-SDAG-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX900-SDAG-NEXT: v_pk_max_f16 v25, v28, v28 +; GFX900-SDAG-NEXT: v_pk_max_f16 v26, v28, v28 ; GFX900-SDAG-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX900-SDAG-NEXT: v_pk_max_f16 v26, v29, v29 +; GFX900-SDAG-NEXT: v_pk_max_f16 v27, v29, v29 ; GFX900-SDAG-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX900-SDAG-NEXT: v_pk_max_f16 v27, v30, v30 +; GFX900-SDAG-NEXT: v_pk_max_f16 v28, v30, v30 ; GFX900-SDAG-NEXT: v_pk_max_f16 v14, v14, v14 ; GFX900-SDAG-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX900-SDAG-NEXT: v_pk_min_f16 v4, v4, v17 -; GFX900-SDAG-NEXT: v_pk_min_f16 v5, v5, v18 -; GFX900-SDAG-NEXT: v_pk_min_f16 v6, v6, v19 -; GFX900-SDAG-NEXT: v_pk_min_f16 v7, v7, v20 -; GFX900-SDAG-NEXT: v_pk_min_f16 v8, v8, v21 -; GFX900-SDAG-NEXT: v_pk_min_f16 v9, v9, v22 -; GFX900-SDAG-NEXT: v_pk_min_f16 v10, v10, v23 -; GFX900-SDAG-NEXT: v_pk_min_f16 v11, v11, v24 -; GFX900-SDAG-NEXT: v_pk_min_f16 v12, v12, v25 -; GFX900-SDAG-NEXT: v_pk_min_f16 v13, v13, v26 -; GFX900-SDAG-NEXT: v_pk_min_f16 v14, v14, v27 +; GFX900-SDAG-NEXT: v_pk_min_f16 v3, v3, v16 +; GFX900-SDAG-NEXT: v_pk_min_f16 v4, v4, v18 +; GFX900-SDAG-NEXT: v_pk_min_f16 v5, v5, v19 +; GFX900-SDAG-NEXT: v_pk_min_f16 v6, v6, v20 +; GFX900-SDAG-NEXT: v_pk_min_f16 v7, v7, v21 +; GFX900-SDAG-NEXT: v_pk_min_f16 v8, v8, v22 +; GFX900-SDAG-NEXT: v_pk_min_f16 v9, v9, v23 +; GFX900-SDAG-NEXT: v_pk_min_f16 v10, v10, v24 +; GFX900-SDAG-NEXT: v_pk_min_f16 v11, v11, v25 +; GFX900-SDAG-NEXT: v_pk_min_f16 v12, v12, v26 +; GFX900-SDAG-NEXT: v_pk_min_f16 v13, v13, v27 +; GFX900-SDAG-NEXT: v_pk_min_f16 v14, v14, v28 ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v16, v16 +; GFX900-SDAG-NEXT: v_pk_max_f16 v16, v17, v17 ; GFX900-SDAG-NEXT: v_pk_min_f16 v15, v15, v16 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5965,52 +5965,52 @@ define <32 x half> 
@v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 ; GFX900-GISEL-NEXT: v_pk_min_f16 v0, v0, v16 -; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v17, v17 +; GFX900-GISEL-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX900-GISEL-NEXT: v_pk_min_f16 v1, v1, v16 ; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v18, v18 ; GFX900-GISEL-NEXT: v_pk_min_f16 v2, v2, v16 ; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v19, v19 -; GFX900-GISEL-NEXT: v_pk_min_f16 v3, v3, v16 -; GFX900-GISEL-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX900-GISEL-NEXT: v_pk_max_f16 v17, v20, v20 +; GFX900-GISEL-NEXT: v_pk_max_f16 v18, v20, v20 ; GFX900-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX900-GISEL-NEXT: v_pk_max_f16 v18, v21, v21 +; GFX900-GISEL-NEXT: v_pk_max_f16 v19, v21, v21 ; GFX900-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX900-GISEL-NEXT: v_pk_max_f16 v19, v22, v22 +; GFX900-GISEL-NEXT: v_pk_max_f16 v20, v22, v22 ; GFX900-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX900-GISEL-NEXT: v_pk_max_f16 v20, v23, v23 +; GFX900-GISEL-NEXT: v_pk_max_f16 v21, v23, v23 ; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX900-GISEL-NEXT: v_pk_max_f16 v21, v24, v24 +; GFX900-GISEL-NEXT: v_pk_max_f16 v22, v24, v24 ; GFX900-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX900-GISEL-NEXT: v_pk_max_f16 v22, v25, v25 +; GFX900-GISEL-NEXT: v_pk_max_f16 v23, v25, v25 ; GFX900-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX900-GISEL-NEXT: v_pk_max_f16 v23, v26, v26 +; GFX900-GISEL-NEXT: v_pk_max_f16 v24, v26, v26 ; GFX900-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX900-GISEL-NEXT: v_pk_max_f16 v24, v27, v27 +; GFX900-GISEL-NEXT: v_pk_max_f16 v25, v27, v27 ; GFX900-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; 
GFX900-GISEL-NEXT: v_pk_max_f16 v25, v28, v28 +; GFX900-GISEL-NEXT: v_pk_max_f16 v26, v28, v28 ; GFX900-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX900-GISEL-NEXT: v_pk_max_f16 v26, v29, v29 +; GFX900-GISEL-NEXT: v_pk_max_f16 v27, v29, v29 ; GFX900-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX900-GISEL-NEXT: v_pk_max_f16 v27, v30, v30 +; GFX900-GISEL-NEXT: v_pk_max_f16 v28, v30, v30 ; GFX900-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX900-GISEL-NEXT: v_pk_min_f16 v4, v4, v17 -; GFX900-GISEL-NEXT: v_pk_min_f16 v5, v5, v18 -; GFX900-GISEL-NEXT: v_pk_min_f16 v6, v6, v19 -; GFX900-GISEL-NEXT: v_pk_min_f16 v7, v7, v20 -; GFX900-GISEL-NEXT: v_pk_min_f16 v8, v8, v21 -; GFX900-GISEL-NEXT: v_pk_min_f16 v9, v9, v22 -; GFX900-GISEL-NEXT: v_pk_min_f16 v10, v10, v23 -; GFX900-GISEL-NEXT: v_pk_min_f16 v11, v11, v24 -; GFX900-GISEL-NEXT: v_pk_min_f16 v12, v12, v25 -; GFX900-GISEL-NEXT: v_pk_min_f16 v13, v13, v26 -; GFX900-GISEL-NEXT: v_pk_min_f16 v14, v14, v27 +; GFX900-GISEL-NEXT: v_pk_min_f16 v3, v3, v16 +; GFX900-GISEL-NEXT: v_pk_min_f16 v4, v4, v18 +; GFX900-GISEL-NEXT: v_pk_min_f16 v5, v5, v19 +; GFX900-GISEL-NEXT: v_pk_min_f16 v6, v6, v20 +; GFX900-GISEL-NEXT: v_pk_min_f16 v7, v7, v21 +; GFX900-GISEL-NEXT: v_pk_min_f16 v8, v8, v22 +; GFX900-GISEL-NEXT: v_pk_min_f16 v9, v9, v23 +; GFX900-GISEL-NEXT: v_pk_min_f16 v10, v10, v24 +; GFX900-GISEL-NEXT: v_pk_min_f16 v11, v11, v25 +; GFX900-GISEL-NEXT: v_pk_min_f16 v12, v12, v26 +; GFX900-GISEL-NEXT: v_pk_min_f16 v13, v13, v27 +; GFX900-GISEL-NEXT: v_pk_min_f16 v14, v14, v28 ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 +; GFX900-GISEL-NEXT: v_pk_max_f16 v16, v17, v17 ; GFX900-GISEL-NEXT: v_pk_min_f16 v15, v15, v16 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 7e3d5c97391e1..7081be89fd18d 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -3319,117 +3319,117 @@ define 
amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; SI-LABEL: v_mul_i128: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; SI-NEXT: v_mov_b32_e32 v9, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[0:1] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_lo_u32 v3, v4, v3 ; SI-NEXT: v_mul_hi_u32 v10, v4, v2 -; SI-NEXT: v_mul_lo_u32 v12, v6, v1 -; SI-NEXT: v_mul_hi_u32 v13, v6, v0 -; SI-NEXT: v_mul_lo_u32 v17, v1, v4 -; SI-NEXT: v_mul_hi_u32 v18, v0, v4 ; SI-NEXT: v_mul_lo_u32 v11, v5, v2 +; SI-NEXT: v_mul_lo_u32 v12, v4, v2 +; SI-NEXT: v_mul_lo_u32 v2, v1, v4 +; SI-NEXT: v_mul_hi_u32 v15, v0, v4 +; SI-NEXT: v_mul_hi_u32 v17, v1, v4 +; SI-NEXT: v_mul_lo_u32 v16, v0, v5 +; SI-NEXT: v_mul_lo_u32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; SI-NEXT: v_mul_lo_u32 v13, v6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, v16, v2 +; SI-NEXT: v_mul_hi_u32 v16, v6, v0 ; SI-NEXT: v_mul_lo_u32 v7, v7, v0 -; SI-NEXT: v_mul_hi_u32 v16, v1, v4 -; SI-NEXT: v_mul_lo_u32 v15, v0, v5 +; SI-NEXT: v_mul_lo_u32 v6, v6, v0 ; SI-NEXT: v_mul_hi_u32 v14, v0, v5 -; SI-NEXT: v_mul_hi_u32 v19, v1, v5 +; SI-NEXT: v_add_i32_e64 v3, s[0:1], v10, v3 +; SI-NEXT: v_add_i32_e64 v10, s[0:1], v16, v13 +; SI-NEXT: v_add_i32_e64 v3, s[0:1], v3, v11 +; SI-NEXT: v_mul_hi_u32 v11, v1, v5 +; 
SI-NEXT: v_add_i32_e64 v7, s[0:1], v10, v7 ; SI-NEXT: v_mul_lo_u32 v5, v1, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, v10, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v13, v12 -; SI-NEXT: v_mul_lo_u32 v2, v4, v2 -; SI-NEXT: v_mul_lo_u32 v6, v6, v0 -; SI-NEXT: v_mul_lo_u32 v0, v0, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, v17, v18 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, v1, v11 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; SI-NEXT: v_add_i32_e32 v1, vcc, v15, v4 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; SI-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; SI-NEXT: v_addc_u32_e32 v5, vcc, v19, v6, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64 +; SI-NEXT: v_add_i32_e64 v6, s[0:1], v6, v12 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v14, vcc +; SI-NEXT: v_mul_lo_u32 v1, v0, v4 +; SI-NEXT: v_addc_u32_e64 v0, vcc, v7, v3, s[0:1] +; SI-NEXT: v_add_i32_e32 v3, vcc, v15, v10 +; SI-NEXT: v_addc_u32_e64 v4, s[0:1], 0, 0, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; SI-NEXT: v_addc_u32_e32 v4, vcc, v4, v0, vcc +; SI-NEXT: buffer_store_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_mul_i128: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; VI-NEXT: v_mov_b32_e32 v10, 0 +; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; 
VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_lo_u32 v3, v4, v3 -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; VI-NEXT: v_mul_lo_u32 v2, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v6, v0, v[14:15] -; VI-NEXT: v_mov_b32_e32 v9, v2 -; VI-NEXT: v_mul_lo_u32 v2, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] -; VI-NEXT: v_mul_lo_u32 v4, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v2, v15 -; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v10 -; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3] -; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v6 -; VI-NEXT: v_add_u32_e32 v10, vcc, v0, v14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, v1, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: v_mul_lo_u32 v10, v4, v3 +; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0 +; VI-NEXT: v_mul_lo_u32 v16, v5, v2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 +; VI-NEXT: v_mov_b32_e32 v10, v3 +; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v1, v4, v[10:11] +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v16 +; VI-NEXT: v_mul_lo_u32 v7, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13] +; VI-NEXT: v_mov_b32_e32 v10, v14 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11] +; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v13 +; VI-NEXT: v_mul_lo_u32 v14, v6, v1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v15, v4 +; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 
v14, v13 +; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc +; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_add3_u32 v9, v9, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v15, v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11] -; GFX9-NEXT: v_mul_lo_u32 v10, v7, v0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, v0, v[8:9] -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v13, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[8:9] -; GFX9-NEXT: v_add3_u32 v5, v10, v7, v15 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc -; GFX9-NEXT: global_store_dwordx4 v14, v[2:5], s[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; GFX9-NEXT: v_add3_u32 
v3, v3, v15, v14 +; GFX9-NEXT: v_mul_lo_u32 v16, v6, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v10 +; GFX9-NEXT: v_mul_lo_u32 v17, v7, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] +; GFX9-NEXT: v_add3_u32 v3, v17, v3, v16 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index f4e5c276b8b75..c991f4577db9a 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -73,22 +73,22 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: .LBB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v18, v9, v0 ; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v19, v3, v16 -; GFX9-NEXT: v_add_u32_e32 v3, v9, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4] +; GFX9-NEXT: v_mul_u32_u24_e32 v19, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v20, v3, v16 +; GFX9-NEXT: v_sub_u32_e32 v3, v18, v19 +; GFX9-NEXT: v_sub_u32_e32 v12, v12, v19 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v13 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: 
v_cndmask_b32_e64 v3, 0, v18, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4] -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7] ; GFX9-NEXT: global_load_dword v3, v[18:19], off diff --git a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir index bb9d22fedf38d..800c0c4f823b3 100644 --- a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir @@ -32,8 +32,6 @@ body: | ; GFX9-NEXT: successors: %bb.1(0x80000000) ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr30_sgpr31 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr16 ; GFX9-NEXT: undef [[COPY2:%[0-9]+]].sub15:vreg_512 = COPY $vgpr15 ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub14:vreg_512 = COPY $vgpr14 ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub13:vreg_512 = COPY $vgpr13 @@ -46,6 +44,8 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub6:vreg_512 = COPY $vgpr6 ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub5:vreg_512 = COPY $vgpr5 ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub4:vreg_512 = COPY $vgpr4 + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr16 ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY $vgpr3 ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY $vgpr2 ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index f78168ba29ef1..0cd5beba93cf2 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -185,6 +185,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0xa4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] @@ -192,38 +193,39 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 ; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) -; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 -; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 -; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 -; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: v_add_f32_e32 v4, s23, v4 +; GFX900-NEXT: v_add_f32_e32 v3, s22, v3 +; GFX900-NEXT: v_add_f32_e32 v2, s21, v2 ; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v8, s19, v8 +; GFX900-NEXT: v_add_f32_e32 v1, s20, v1 +; GFX900-NEXT: v_add_f32_e32 v7, s18, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s17, v6 +; GFX900-NEXT: v_add_f32_e32 v5, s16, v5 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_f32_e32 v12, s31, v12 +; GFX900-NEXT: v_add_f32_e32 v11, s30, v11 +; GFX900-NEXT: v_add_f32_e32 v10, s29, v10 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_f32_e32 v16, s27, v16 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 ; 
GFX900-NEXT: v_add_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_add_f32_e32 v29, s16, v29 -; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 -; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 -; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 -; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 -; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 -; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 -; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 -; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 -; GFX900-NEXT: v_add_f32_e32 v13, s44, v13 +; GFX900-NEXT: v_add_f32_e32 v9, s28, v9 +; GFX900-NEXT: v_add_f32_e32 v15, s26, v15 +; GFX900-NEXT: v_add_f32_e32 v14, s25, v14 ; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 +; GFX900-NEXT: v_add_f32_e32 v13, s24, v13 ; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 -; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 ; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 +; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 ; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 ; GFX900-NEXT: v_add_f32_e32 v21, s8, v21 @@ -343,66 +345,70 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v52, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] 
offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v52, s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v52, s[34:35] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v52, s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v52, s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v52, s[34:35] offset:80 ; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] -; 
GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v24, s22 :: v_dual_mov_b32 v22, s18 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v23, s19 :: v_dual_mov_b32 v28, s30 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v30, s24 :: v_dual_mov_b32 v31, s25 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s26 :: v_dual_mov_b32 v29, s31 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s13 :: v_dual_mov_b32 v26, s28 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v27, s29 :: v_dual_mov_b32 v25, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s3 :: v_dual_mov_b32 v42, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v46, s10 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v35, s5 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[0:1] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s27 :: v_dual_mov_b32 v45, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s11 :: v_dual_mov_b32 v43, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s2 :: v_dual_mov_b32 v40, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v36, s6 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v37, s7 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[20:21] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[22:23] +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v52, s[34:35] offset:64 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[30:31] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[28:29] +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v52, s[34:35] offset:96 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], 
v[2:3], v[24:25] +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v52, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[36:37] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[32:33] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[48:49] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[50:51] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[46:47] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[52:53] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[54:55] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[50:51] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[40:41] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[46:47] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[48:49] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[42:43] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[44:45] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[42:43] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], 
s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[28:31], s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[24:27], s[34:35] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[20:23], s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[16:19], s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[12:15], s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[8:11], s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[4:7], s[34:35] +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[0:3], s[34:35] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_v32_vs: @@ -412,59 +418,60 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: s_clause 0x3 ; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] ; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 ; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 ; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[28:29], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[30:31], s[24:25] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[22:23] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; 
GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[24:25] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[44:45] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[46:47] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[26:27] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[48:49] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[50:51] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[30:31] +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[28:29] +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[34:35] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[38:39] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[40:41] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[42:43] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[52:53] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[54:55] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[44:45] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[46:47] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], 
v[24:25], v[48:49] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[50:51] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[52:53] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[54:55] ; GFX1250-GISEL-NEXT: s_clause 0x7 ; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] ; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 @@ -1442,6 +1449,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0xa4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] @@ -1449,38 +1457,39 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 ; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) -; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 -; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 -; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 -; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: v_mul_f32_e32 v4, s23, v4 +; GFX900-NEXT: v_mul_f32_e32 v3, s22, v3 +; GFX900-NEXT: v_mul_f32_e32 v2, s21, v2 ; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 -; GFX900-NEXT: 
v_mul_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v8, s19, v8 +; GFX900-NEXT: v_mul_f32_e32 v1, s20, v1 +; GFX900-NEXT: v_mul_f32_e32 v7, s18, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s17, v6 +; GFX900-NEXT: v_mul_f32_e32 v5, s16, v5 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_mul_f32_e32 v12, s31, v12 +; GFX900-NEXT: v_mul_f32_e32 v11, s30, v11 +; GFX900-NEXT: v_mul_f32_e32 v10, s29, v10 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_mul_f32_e32 v16, s27, v16 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 -; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 -; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 -; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 -; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 -; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 -; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 -; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 -; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 -; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 +; GFX900-NEXT: v_mul_f32_e32 v9, s28, v9 +; GFX900-NEXT: v_mul_f32_e32 v15, s26, v15 +; GFX900-NEXT: v_mul_f32_e32 v14, s25, v14 ; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 +; GFX900-NEXT: v_mul_f32_e32 v13, s24, v13 ; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 -; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 ; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 +; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 ; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 ; GFX900-NEXT: v_mul_f32_e32 v21, s8, v21 @@ -1600,66 +1609,70 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; 
GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v52, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v52, s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v52, s[34:35] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v52, s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v52, s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v52, s[34:35] offset:80 ; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: 
v_dual_mov_b32 v50, s2 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v24, s22 :: v_dual_mov_b32 v22, s18 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v23, s19 :: v_dual_mov_b32 v28, s30 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v30, s24 :: v_dual_mov_b32 v31, s25 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s26 :: v_dual_mov_b32 v29, s31 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s13 :: v_dual_mov_b32 v26, s28 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v27, s29 :: v_dual_mov_b32 v25, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s3 :: v_dual_mov_b32 v42, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v46, s10 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v35, s5 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[0:1] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s27 :: v_dual_mov_b32 v45, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s11 :: v_dual_mov_b32 v43, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s2 :: v_dual_mov_b32 v40, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v36, s6 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v37, s7 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 +; 
GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[20:21] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[22:23] +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v52, s[34:35] offset:64 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[30:31] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[28:29] +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v52, s[34:35] offset:96 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[26:27] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[24:25] +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v52, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[36:37] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[32:33] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[48:49] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[50:51] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[46:47] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[52:53] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[54:55] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[50:51] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[40:41] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[46:47] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[48:49] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[42:43] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[44:45] -; GFX1250-SDAG-NEXT: 
v_pk_mul_f32 v[14:15], v[14:15], v[36:37] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[42:43] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[28:31], s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[24:27], s[34:35] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[20:23], s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[16:19], s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[12:15], s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[8:11], s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[4:7], s[34:35] +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[0:3], s[34:35] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fmul_v32_vs: @@ -1669,59 +1682,60 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: s_clause 0x3 ; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] ; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 ; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 ; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], 
v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GFX1250-GISEL-NEXT: 
v_mov_b64_e32 v[28:29], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[30:31], s[24:25] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[22:23] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[24:25] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[44:45] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[46:47] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[26:27] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[48:49] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[50:51] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[30:31] +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[28:29] +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[34:35] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], 
v[38:39] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[40:41] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[42:43] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[52:53] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[54:55] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[44:45] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[46:47] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[48:49] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[50:51] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[52:53] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[54:55] ; GFX1250-GISEL-NEXT: s_clause 0x7 ; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] ; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 @@ -2273,6 +2287,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0xa4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] @@ -2280,38 +2295,39 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: 
s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 ; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) -; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 -; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 -; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 -; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 +; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: v_fma_f32 v4, v4, s23, s23 +; GFX900-NEXT: v_fma_f32 v3, v3, s22, s22 +; GFX900-NEXT: v_fma_f32 v2, v2, s21, s21 ; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 -; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 -; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v8, v8, s19, s19 +; GFX900-NEXT: v_fma_f32 v1, v1, s20, s20 +; GFX900-NEXT: v_fma_f32 v7, v7, s18, s18 +; GFX900-NEXT: v_fma_f32 v6, v6, s17, s17 +; GFX900-NEXT: v_fma_f32 v5, v5, s16, s16 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_fma_f32 v12, v12, s31, s31 +; GFX900-NEXT: v_fma_f32 v11, v11, s30, s30 +; GFX900-NEXT: v_fma_f32 v10, v10, s29, s29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_fma_f32 v16, v16, s27, s27 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 ; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 ; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 ; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 -; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 -; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 -; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 -; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 -; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 -; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 -; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 -; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 -; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 +; GFX900-NEXT: v_fma_f32 v9, v9, s28, s28 +; 
GFX900-NEXT: v_fma_f32 v15, v15, s26, s26 +; GFX900-NEXT: v_fma_f32 v14, v14, s25, s25 ; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 +; GFX900-NEXT: v_fma_f32 v13, v13, s24, s24 ; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 ; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 -; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 ; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 +; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 ; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 ; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 ; GFX900-NEXT: v_fma_f32 v21, v21, s8, s8 @@ -2431,65 +2447,69 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v52, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v52, s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v52, s[34:35] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v52, s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v52, s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v52, s[34:35] offset:80 ; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-SDAG-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[28:29], s[30:31] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[30:31], s[24:25] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[24:25], s[22:23] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[26:27], s[28:29] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[8:9] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[10:11] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[12:13] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[14:15] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[2:3] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[0:1] +; 
GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[4:5] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[6:7] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[26:27] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[20:21], v[20:21] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[22:23], v[22:23] +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v52, s[34:35] offset:64 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[30:31], v[30:31] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[28:29], v[28:29] +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v52, s[34:35] offset:96 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[26:27], v[26:27] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[24:25], v[24:25] +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v52, s[34:35] offset:112 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[34:35], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[36:37], v[36:37] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[32:33], v[32:33] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[48:49], v[48:49] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[38:39], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[50:51], v[50:51] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[44:45], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[46:47], v[46:47] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[34:35], v[34:35] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 
v[30:31], v[30:31], v[54:55], v[54:55] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[50:51], v[50:51] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[46:47], v[46:47] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[48:49], v[48:49] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[42:43], v[42:43] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[44:45], v[44:45] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[42:43], v[42:43] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[28:31], s[34:35] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[24:27], s[34:35] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[20:23], s[34:35] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[16:19], s[34:35] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[12:15], s[34:35] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[8:11], s[34:35] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[4:7], s[34:35] +; GFX1250-SDAG-NEXT: global_store_b128 v52, v[0:3], s[34:35] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fma_v32_vs: @@ -2499,59 +2519,60 @@ define 
amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: s_clause 0x3 ; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] ; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 ; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 ; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 ; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 
v[32:33], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[28:29], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[30:31], s[24:25] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[22:23] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[24:25], v[24:25] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[26:27], v[26:27] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[30:31], v[30:31] +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], 
v[28:29], v[28:29] +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[32:33], v[32:33] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[34:35], v[34:35] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[44:45], v[44:45] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[46:47], v[46:47] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[36:37], v[36:37] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[38:39], v[38:39] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[48:49], v[48:49] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[50:51], v[50:51] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[40:41], v[40:41] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[42:43], v[42:43] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[52:53], v[52:53] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[54:55], v[54:55] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[44:45], v[44:45] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[46:47], v[46:47] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[32:33], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[34:35], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[48:49], v[48:49] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[50:51], v[50:51] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[54:55], v[54:55] ; GFX1250-GISEL-NEXT: s_clause 0x7 ; 
GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] ; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index 69983faf2b154..f079089afb1e9 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -6,7 +6,7 @@ # is killed by that store. # GCN-LABEL: name: global_sextload_v32i32_to_v32i64 -# GCN: renamable $vgpr34_vgpr35_vgpr36_vgpr37 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) +# GCN: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) # GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr5, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr4 # GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index f4a9e7e8f2759..ac123e64adf55 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -238,8 +238,8 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3 -; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2 +; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b1e05158b6212..fbcf7ccb8fc55 100644 --- 
a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -365,110 +365,114 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 -; GFX8-NEXT: v_and_b32_e32 v12, 0xfe000000, v1 +; GFX8-NEXT: v_and_b32_e32 v22, 0xfe000000, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v22, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x5000 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v10, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, 0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: s_movk_i32 s0, 0x7f +; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: s_movk_i32 s12, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: s_mov_b32 s13, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffb800, v2 +; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[4:5] ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[6:7] +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; 
GFX8-NEXT: flat_load_dwordx2 v[17:18], v[4:5] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffd800, v2 -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe000, v2 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[19:20] -; GFX8-NEXT: s_addk_i32 s1, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v23, vcc, v13, v10 -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, v14, v11, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xfffff000, v2 -; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[21:22] -; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v15, v23 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v16, v24, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xfffff800, v2 -; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] -; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v17, v21 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v18, v22, vcc -; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc800, v2 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffd000, v2 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[11:12] +; GFX8-NEXT: s_mov_b64 s[0:1], vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffd800, v2 +; GFX8-NEXT: 
s_mov_b64 s[2:3], vcc +; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xffffe000, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xffffe800, v2 +; GFX8-NEXT: s_mov_b64 s[6:7], vcc +; GFX8-NEXT: s_addk_i32 s13, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s13, 0x3fffff +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e64 v11, s[10:11], v20, v18 +; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], v21, v19, s[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e64 v8, s[10:11], v8, v11 +; GFX8-NEXT: v_addc_u32_e64 v11, s[0:1], -1, v3, s[0:1] +; GFX8-NEXT: v_addc_u32_e64 v25, s[10:11], v9, v13, s[10:11] +; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[10:11] +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], -1, v3, s[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[12:13] +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], -1, v3, s[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[14:15] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xfffff000, v2 +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], -1, v3, s[6:7] +; GFX8-NEXT: s_mov_b64 s[8:9], vcc +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v19, s[0:1], -1, v3, s[8:9] +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xfffff800, v2 +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[18:19] +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[20:21] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v25, vcc ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v21 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v22, vcc -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GFX8-NEXT: s_waitcnt vmcnt(5) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, 
v7, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v19, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v20, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v11, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc -; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc -; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v17, v4 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v18, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v19, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v20, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v23, v4 +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, v24, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_add_i32 s1, s0, -1 -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: s_add_i32 s0, s12, -1 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_mov_b32 s0, s1 +; GFX8-NEXT: s_mov_b32 s12, s0 ; GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v22 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[10:11] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[18:19] ; GFX8-NEXT: 
s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -520,56 +524,54 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[9:10], v[2:3], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[11:12], v[2:3], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v2 +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[2:3], off +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, 0xffffc000, v2 ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v2 -; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[19:20], v[15:16], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v17, vcc, s2, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v18, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[23:24], v[15:16], off +; GFX900-NEXT: global_load_dwordx2 v[25:26], v[17:18], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[15:16], v[21:22], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[21:22], off ; GFX900-NEXT: s_addk_i32 s6, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21 -; GFX900-NEXT: v_addc_co_u32_e64 v24, 
s[0:1], v18, v5, s[0:1] -; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e64 v7, s[0:1], v7, v4 +; GFX900-NEXT: v_addc_co_u32_e64 v5, s[0:1], v8, v5, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e64 v19, s[0:1], v19, v7 +; GFX900-NEXT: global_load_dwordx2 v[7:8], v[21:22], off offset:-2048 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 +; GFX900-NEXT: v_addc_co_u32_e64 v20, s[0:1], v20, v5, s[0:1] ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 -; GFX900-NEXT: global_load_dwordx2 v[13:14], v[2:3], off -; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v23, v19 +; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v24, v20, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v25, v19 +; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v26, v20, vcc +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19 ; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc -; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15 ; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc -; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7 ; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc -; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_co_u32_e32 v4, 
vcc, v4, v7 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v8, vcc ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v10, v5, vcc ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v11, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v5, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index bfc310ad93ead..9793d257a0f35 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -70,23 +70,23 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc ; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_or_b32_e32 v12, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_xor_b32_e32 v11, 0x7f, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v11, v8 +; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6 +; GFX9-NEXT: v_or_b32_e32 v11, v7, v9 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v0, 0, s[4:5] ; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], 
s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB0_6 @@ -107,38 +107,38 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v13, v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v13, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v9, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v12, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v24, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v12, v8, v12 -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v13, v9, v13 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v12, v10, v12 +; GFX9-NEXT: v_subrev_u32_e32 v10, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v13, v11, v13 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v10, v[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc -; GFX9-NEXT: 
v_lshrrev_b64 v[8:9], v24, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v15, v11, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v12, v10, v12, vcc +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v24, v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v14, v12, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v10, vcc ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v22, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc @@ -148,75 +148,75 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v19, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 31, v15 -; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 31, v7 -; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v9 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] -; GFX9-NEXT: v_or_b32_e32 v14, v14, v33 -; GFX9-NEXT: v_or3_b32 v6, v6, v8, v12 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v28, v14 -; GFX9-NEXT: v_or_b32_e32 v16, v16, v32 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v29, v15, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v30, v16, vcc -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v31, v17, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX9-NEXT: v_or_b32_e32 v10, v18, v10 -; GFX9-NEXT: v_and_b32_e32 v18, v8, v23 -; GFX9-NEXT: v_or_b32_e32 v11, v19, v11 -; GFX9-NEXT: v_and_b32_e32 v19, v8, v22 +; GFX9-NEXT: v_or_b32_e32 v8, v18, v8 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v18, 31, v15 +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] +; GFX9-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 31, v7 +; GFX9-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX9-NEXT: v_sub_co_u32_e32 v18, vcc, v28, v14 +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v29, v15, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v30, v16, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v31, v17, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v32, 31, v18 +; GFX9-NEXT: v_and_b32_e32 v18, v32, v23 ; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18 -; GFX9-NEXT: v_and_b32_e32 v32, v8, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v19, vcc -; GFX9-NEXT: v_and_b32_e32 v33, v8, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v32, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v33, vcc +; GFX9-NEXT: v_and_b32_e32 v18, v32, v22 +; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v18, vcc +; GFX9-NEXT: v_and_b32_e32 v18, v32, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc +; GFX9-NEXT: v_and_b32_e32 v18, v32, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v18, vcc ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, -1, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v26, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v27, vcc +; GFX9-NEXT: v_or_b32_e32 v9, v19, v9 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v18, v24, v26 ; GFX9-NEXT: v_or_b32_e32 v19, v25, v27 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_mov_b32_e32 v19, v9 +; GFX9-NEXT: v_or3_b32 v6, v6, v10, v12 +; GFX9-NEXT: v_and_b32_e32 v10, 1, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v11 ; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-NEXT: v_mov_b32_e32 v18, v10 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB0_3 ; GFX9-NEXT: ; %bb.4: ; %Flow ; GFX9-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 31, v11 -; GFX9-NEXT: v_or3_b32 v10, v7, 0, v13 -; GFX9-NEXT: v_or3_b32 v12, v6, v11, v12 -; GFX9-NEXT: v_or_b32_e32 v11, v9, v15 -; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v9 +; GFX9-NEXT: v_or3_b32 v13, v7, 0, v13 +; GFX9-NEXT: v_or3_b32 v12, v6, v8, v12 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v15 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v18, v13, v5 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 +; GFX9-NEXT: v_mul_lo_u32 v17, v10, v5 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v10, 0 ; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 +; GFX9-NEXT: v_mul_lo_u32 v16, v11, v4 ; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v22, v13, v[14:15] -; GFX9-NEXT: v_add3_u32 v8, v8, v18, v9 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v22, v10, v[14:15] +; GFX9-NEXT: v_add3_u32 v8, v8, v17, v16 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-NEXT: v_mov_b32_e32 v14, v9 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v23, v11, v[14:15] ; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v12, v10, v23 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v17, v9 +; GFX9-NEXT: v_mul_lo_u32 v12, v13, v23 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v22, v11, v[9:10] ; GFX9-NEXT: v_add3_u32 v4, v12, v7, v4 @@ -1536,38 +1536,38 @@ define i128 
@v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[10:11] ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 -; GFX9-NEXT: v_or_b32_e32 v10, v20, v30 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] +; GFX9-NEXT: v_or_b32_e32 v10, v20, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v17 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] -; GFX9-NEXT: v_or_b32_e32 v11, v21, v31 -; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 31, v9 -; GFX9-NEXT: v_or_b32_e32 v16, v16, v21 ; GFX9-NEXT: v_or_b32_e32 v18, v18, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v16, v16, v20 ; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v26, v16 ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v27, v17, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v28, v18, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v29, v19, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v20 ; GFX9-NEXT: v_and_b32_e32 v20, v30, v4 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v16, v20 ; GFX9-NEXT: v_and_b32_e32 v20, v30, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v20, vcc -; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 -; GFX9-NEXT: v_and_b32_e32 v12, v30, v6 +; GFX9-NEXT: v_and_b32_e32 v20, v30, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v20, vcc ; GFX9-NEXT: v_and_b32_e32 v20, v30, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v12, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v19, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc +; GFX9-NEXT: v_or_b32_e32 v11, v21, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v20, v22, v24 ; GFX9-NEXT: 
v_or_b32_e32 v21, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 ; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 4aac193d6aeab..4d23eefa3f172 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -1588,68 +1588,68 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] -; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 -; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] -; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], 
v[4:5] +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[12:13], v[16:17], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9] ; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11] -; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] -; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 -; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[12:13], v[8:9], v[10:11] +; SI-SDAG-NEXT: v_mov_b32_e32 v16, 0xffffff80 +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v17, 0x260 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[10:11] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v16, s[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[4:5], v[6:7] +; SI-SDAG-NEXT: 
v_ldexp_f64 v[8:9], v[8:9], v10 +; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v17 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v16, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v17 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] -; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] -; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[8:9], v[6:7], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[14:15], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[20:21], s[4:5], 1.0, v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: 
v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[14:15], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[16:17], v[14:15] +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v15 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[20:21], v[6:7] ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[16:17] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[20:21] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v21 ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; SI-SDAG-NEXT: s_nop 0 @@ -1670,69 +1670,70 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0x3ff00000 +; SI-GISEL-NEXT: v_mov_b32_e32 v22, 0x3ff00000 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], 
v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] -; SI-GISEL-NEXT: 
v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_div_scale_f64 v[14:15], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] -; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] -; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; SI-GISEL-NEXT: 
v_cmp_eq_u32_e32 vcc, v15, v22 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[4:5], v[10:11], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[14:15], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[4:5], v[10:11], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[20:21], -v[4:5], v[16:17], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v22 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] -; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 +; SI-GISEL-NEXT: s_nop 0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[20:21], v[10:11], v[16:17] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_rsq_v2f64: @@ -1888,68 +1889,68 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100 -; 
SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] -; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 -; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] -; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[12:13], v[16:17], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9] ; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11] -; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-SDAG-NEXT: 
v_cndmask_b32_e64 v10, 0, v14, s[4:5] -; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 -; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[12:13], v[8:9], v[10:11] +; SI-SDAG-NEXT: v_mov_b32_e32 v16, 0xffffff80 +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v17, 0x260 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[10:11] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v16, s[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v10 +; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v17 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v16, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v17 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; SI-SDAG-NEXT: 
v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], -1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] -; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] -; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[8:9], v[6:7], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[14:15], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[20:21], s[4:5], -1.0, v[2:3], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[14:15], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[16:17], v[14:15] +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v15 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[20:21], v[6:7] ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], 
-v[10:11], v[8:9], v[18:19] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[16:17] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[20:21] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v21 ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-SDAG-NEXT: s_nop 0 @@ -1970,69 +1971,70 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0xbff00000 +; SI-GISEL-NEXT: v_mov_b32_e32 v22, 0xbff00000 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; 
SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_div_scale_f64 v[14:15], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; 
SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] -; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] -; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], -1.0, v[2:3], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v22 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[4:5], v[10:11], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[14:15], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[4:5], v[10:11], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], 
v[12:13], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[20:21], -v[4:5], v[16:17], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v22 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] -; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0 +; SI-GISEL-NEXT: s_nop 0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[20:21], v[10:11], v[16:17] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], -1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_neg_rsq_v2f64: @@ -2243,6 +2245,7 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -2264,44 +2267,43 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] -; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc ; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15] -; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v17 
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[14:15], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[4:5], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[14:15], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v20 +; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[18:19], v[6:7] +; SI-GISEL-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[10:11], v[4:5], v[16:17] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[12:13], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-GISEL-NEXT: s_nop 0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[10:11], v[6:7], v[12:13] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2430,69 +2432,69 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100 -; 
SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] -; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] -; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 -; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] -; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] +; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 0.5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[12:13], v[16:17], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9] ; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11] -; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-SDAG-NEXT: 
v_cndmask_b32_e64 v10, 0, v14, s[4:5] -; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 -; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7] -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[12:13], v[8:9], v[10:11] +; SI-SDAG-NEXT: v_mov_b32_e32 v16, 0xffffff80 +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v17, 0x260 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[10:11] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v16, s[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v10 +; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v17 +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v16, vcc +; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v17 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; SI-SDAG-NEXT: 
v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] -; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] -; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[8:9], v[6:7], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[14:15], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[20:21], s[4:5], 1.0, v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[14:15], v[4:5] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; SI-SDAG-NEXT: s_mov_b32 s4, 0xbff00000 +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[16:17], v[14:15] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v15 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[20:21], v[6:7] ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3ff00000 -; SI-SDAG-NEXT: 
v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[16:17] +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[20:21] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v19 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v21 ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-SDAG-NEXT: s_nop 0 @@ -2517,7 +2519,9 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mov_b32_e32 v22, 0x3ff00000 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] @@ -2538,46 +2542,44 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] -; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] +; SI-GISEL-NEXT: 
v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0 ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[14:15], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0 -; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15] +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v20 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[4:5], v[10:11], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[14:15], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] 
+; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[4:5], v[10:11], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17] -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v6 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[20:21], -v[4:5], v[16:17], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v22 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0 ; SI-GISEL-NEXT: s_nop 0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13] -; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[20:21], v[10:11], v[16:17] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_neg_pos_rsq_v2f64: diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index c90975959c3f4..a7bd3c8b2bc6d 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -23,9 +23,9 @@ body: | ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec + ; 
CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 3ca61d26e8e42..a0b5535cc6a91 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -51,8 +51,8 @@ body: | ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %21, 851978 /* regdef:VGPR_16 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_4]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead 
[[V_MOV_B32_e32_3]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_4]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index c5732531f5423..84d5568f552f4 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -44,7 +44,7 @@ entry: ; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64: ; TONGA: NumSgprs: 96 ; TONGA-GCNTRACKERS: NumSgprs: 96 -; TONGA: NumVgprs: 21 +; TONGA: NumVgprs: 26 ; TONGA-GCNTRACKERS: NumVgprs: 23 ; TONGA: Occupancy: 8 ; TONGA-GCNTRACKERS: Occupancy: 8 @@ -59,9 +59,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32: ; GENERIC: NumSgprs: 71 -; GENERIC-GCNTRACKERS: NumSgprs: 45 -; GENERIC: NumVgprs: 20 -; GENERIC-GCNTRACKERS: NumVgprs: 20 +; GENERIC-GCNTRACKERS: NumSgprs: 35 +; GENERIC: NumVgprs: 24 +; GENERIC-GCNTRACKERS: NumVgprs: 34 ; GENERIC: Occupancy: 7 ; GENERIC-GCNTRACKERS: Occupancy: 10 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir index 
88e11c9ce3d1d..2c0bed669a5b2 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -17,9 +17,9 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub3:vreg_128 = COPY $vgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub2:vreg_128 = COPY $vgpr8 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_128 = COPY $vgpr7 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 + ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5 ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4 ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_128 = COPY $vgpr3 @@ -27,10 +27,10 @@ body: | ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY6]].sub2, implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub3, implicit $exec ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub3, implicit $exec ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY3]].sub0, implicit $exec ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY2]].sub1, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY1]].sub2, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY]].sub3, implicit $exec @@ -38,7 +38,7 @@ body: | ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY4]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 undef %43.sub3:vreg_128 = COPY $vgpr9 undef %42.sub2:vreg_128 = COPY $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll index 7b8eba1091b48..0ab1a8edabdca 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll @@ -7,16 +7,16 @@ ; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target ; CHECK-LABEL: {{^}}load_fma_store: -; OCC: NumVgprs: 24 -; OCC-GCNTRACKER: NumVgprs: 26 +; OCC: NumVgprs: 28 +; OCC-GCNTRACKER: NumVgprs: 20 ; RELAX: NumVgprs: 64 ; RELAX-GCNTRACKER: NumVgprs: 60 -; OCC: NumVGPRsForWavesPerEU: 24 -; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 26 +; OCC: NumVGPRsForWavesPerEU: 28 +; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 20 ; RELAX: NumVGPRsForWavesPerEU: 64 ; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60 -; OCC: Occupancy: 10 -; OCC-GCNTRACKER: Occupancy: 9 +; OCC: Occupancy: 9 +; 
OCC-GCNTRACKER: Occupancy: 10 ; RELAX: Occupancy: 4 ; RELAX-GCNTRACKER: Occupancy: 4 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 7a3bff8aed56e..063153bd26f79 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -482,22 +482,54 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; ; GFX10_W64-MUBUF-LABEL: ps_main: ; GFX10_W64-MUBUF: ; %bb.0: +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xb702e758 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3e31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbefcd8a3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb7043519 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3eae29d8 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf523be3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e +; GFX10_W64-MUBUF-NEXT: 
v_mov_b32_e32 v13, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:256 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3eae29dc +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:252 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd89f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v20, 0xbf20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v21, 0xbf3d349e +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:284 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:264 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:260 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:244 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:236 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:220 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 @@ -505,179 +537,148 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 ; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: 
v_mov_b32_e32 v12, 0xbf3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 
0 offset:796 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:280 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:276 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:272 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:268 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:248 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:240 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:232 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:228 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:224 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:216 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:208 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:200 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v13, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:832 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:828 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:824 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:820 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:816 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:812 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:804 +; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v20, off, s[0:3], 0 offset:800 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:792 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:796 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:768 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:760 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720 -; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v9, off, s[0:3], 0 offset:716 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:744 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:740 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:736 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:732 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:728 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:724 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:720 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:716 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:712 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-LABEL: ps_main: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: 
v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 
0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: 
v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: 
scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; 
GFX10-FLATSCR-LABEL: ps_main: @@ -776,93 +777,93 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: 
v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 
v17, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; 
GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: ps_main: @@ -1505,22 +1506,54 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; ; GFX10_W64-MUBUF-LABEL: vs_main: ; GFX10_W64-MUBUF: ; %bb.0: +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0 ; 
GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xb702e758 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3e31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbefcd8a3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb7043519 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3eae29d8 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf523be3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:256 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3eae29dc +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:252 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd89f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v20, 0xbf20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v21, 0xbf3d349e +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:284 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:264 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:260 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:244 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:236 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:220 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 @@ -1528,179 +1561,148 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272 -; GFX10_W64-MUBUF-NEXT: 
v_mov_b32_e32 v1, 0xbe319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, 
off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:280 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:276 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:272 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:268 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:248 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:240 +; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:232 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:228 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:224 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:216 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:208 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:200 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v13, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:832 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:828 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:824 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:820 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:816 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:812 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:804 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:800 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:792 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 
offset:776 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:796 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:768 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:760 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:744 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:740 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:736 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, 
s[0:3], 0 offset:732 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:728 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:724 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:720 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:716 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:712 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-LABEL: vs_main: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: 
v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 
v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 
off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: vs_main: @@ -1799,93 +1801,93 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v6 +; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v20 
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 
v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 
0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: vs_main: @@ -2528,22 +2530,54 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; ; GFX10_W64-MUBUF-LABEL: cs_main: ; GFX10_W64-MUBUF: ; %bb.0: +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 
0xb702e758 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3e31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbefcd8a3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb7043519 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3eae29d8 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf523be3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:256 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3eae29dc +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:252 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd89f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v20, 0xbf20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v21, 0xbf3d349e +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:284 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee3 +; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:264 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:260 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:244 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:236 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:220 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 @@ -2551,179 +2585,150 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 
offset:256 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 
v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:280 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:276 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:272 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:268 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:248 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:240 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:232 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:228 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:224 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:216 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:208 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:200 +; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v13, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:832 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:828 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:824 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:820 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:816 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:812 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:804 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:800 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:792 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:796 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:768 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760 
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:760 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:744 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:740 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:736 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:732 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:728 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:724 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:720 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:716 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:712 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; 
GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-LABEL: cs_main: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 
0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:320 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbe319356 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:224 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29dc +; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v25, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v23 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v25 +; GFX9-FLATSCR-NEXT: scratch_load_dword v2, v0, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v20 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v1 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:704 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(8) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-NEXT: s_nop 0 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v25 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: cs_main: @@ -2822,93 +2827,95 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-FLATSCR-PAL-NEXT: 
s_add_u32 flat_scratch_lo, s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: 
v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v25, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 
0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v23 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v25 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v2, v0, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v20 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf523be1 +; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], 
s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v1 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(8) +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: s_nop 0 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v25 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: cs_main: @@ -3549,21 +3556,53 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; ; GFX10_W64-MUBUF-LABEL: hs_main: ; GFX10_W64-MUBUF: ; %bb.0: +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xb702e758 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; 
GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3e31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbefcd8a3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb7043519 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3eae29d8 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf523be3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:256 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3eae29dc +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:252 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd89f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v20, 0xbf20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v21, 0xbf3d349e +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:284 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:264 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword 
v11, off, s[0:3], 0 offset:260 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:244 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:236 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:220 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 @@ -3571,179 +3610,148 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356 -; GFX10_W64-MUBUF-NEXT: 
v_mov_b32_e32 v10, 0x3eae29d8 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 -; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:280 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:276 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:272 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:268 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:248 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:240 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:232 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:228 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:224 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:216 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:208 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:200 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 
0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v13, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:832 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:828 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:824 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:820 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:816 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:812 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:804 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:800 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:792 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:796 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:768 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:760 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756 -; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v19, off, s[0:3], 0 offset:752 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:744 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:740 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:736 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:732 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:728 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:724 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:720 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:716 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:712 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10_W64-MUBUF-NEXT: ; 
return to shader part epilog ; ; GFX9-FLATSCR-LABEL: hs_main: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v7 +; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 
offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: 
v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v8 +; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: hs_main: @@ -3842,93 +3850,93 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; 
GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 
0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], 
s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 
offset:224 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v5, 
0x200, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: hs_main: @@ -4569,21 +4577,53 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; ; GFX10_W64-MUBUF-LABEL: gs_main: ; GFX10_W64-MUBUF: ; %bb.0: +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xb702e758 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3e31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbefcd8a3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb7043519 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3eae29d8 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320 +; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v1, off, s[0:3], 0 offset:288 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf523be3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:256 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3eae29dc +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:252 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd89f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v20, 0xbf20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v21, 0xbf3d349e +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:284 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:264 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:260 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:244 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:236 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:220 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, 
off, s[0:3], 0 offset:308 @@ -4591,179 +4631,148 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236 -; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v13, off, s[0:3], 0 offset:232 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v8, off, s[0:3], 0 offset:804 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:280 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:276 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:272 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:268 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:248 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:240 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:232 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:228 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:224 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:216 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:208 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:200 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v13, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:832 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:828 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:824 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:820 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:816 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:812 
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:804 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:800 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:792 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:796 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:768 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:760 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728 -; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[0:3], 0 offset:744 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:740 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:736 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:732 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:728 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:724 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:720 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:716 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:712 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-LABEL: gs_main: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 ; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: 
v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: gs_main: @@ -4862,93 +4871,93 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: 
v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: gs_main: @@ -5597,21 +5606,53 @@ define amdgpu_hs <{i32, 
i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; ; GFX10_W64-MUBUF-LABEL: hs_ir_uses_scratch_offset: ; GFX10_W64-MUBUF: ; %bb.0: +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0xb702e758 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3e31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbefcd8a3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb7043519 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3eae29d8 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf523be3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:256 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3eae29dc +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 
v0, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:252 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd89f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v20, 0xbf20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v21, 0xbf3d349e +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:284 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:264 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:260 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:244 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:236 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:220 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 @@ -5619,181 +5660,150 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 -; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v8, off, s[8:11], 0 offset:280 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 -; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v12, off, s[8:11], 0 offset:224 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:280 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:276 +; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v8, off, s[8:11], 0 offset:272 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:268 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:248 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:240 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:232 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:228 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:224 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:216 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:208 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:200 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v12, s[8:11], 0 offen +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:832 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:828 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:824 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:820 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:816 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:812 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:804 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:800 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:792 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, 
off, s[8:11], 0 offset:784 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:796 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:768 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:760 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: 
buffer_load_dword v1, v6, s[8:11], 0 offen +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:744 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:740 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:736 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:732 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:728 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:724 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:720 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:716 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:712 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-LABEL: hs_ir_uses_scratch_offset: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 
offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: 
scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: 
scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: hs_ir_uses_scratch_offset: @@ -5893,94 +5903,94 @@ 
define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 
0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; 
GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: hs_ir_uses_scratch_offset: @@ -6628,21 +6638,53 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; ; GFX10_W64-MUBUF-LABEL: gs_ir_uses_scratch_offset: ; GFX10_W64-MUBUF: ; 
%bb.0: +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0xb702e758 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3e31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbefcd8a3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb7043519 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3eae29d8 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf523be3 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:256 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3eae29dc +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; 
GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:252 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd89f +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe319356 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v20, 0xbf20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v21, 0xbf3d349e +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:284 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:264 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:260 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:244 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:236 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:220 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 @@ -6650,181 +6692,150 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276 -; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:280 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:276 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:272 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:268 +; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:248 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:240 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:232 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:228 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:224 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:216 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:208 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:200 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v12, s[8:11], 0 offen +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:832 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:828 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:824 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:820 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:816 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:812 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:804 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:800 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:792 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: 
buffer_store_dword v4, off, s[8:11], 0 offset:780 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:796 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:768 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:760 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:744 +; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:740 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:736 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:732 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:728 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:724 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:720 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:716 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:712 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-LABEL: gs_ir_uses_scratch_offset: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: 
v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 
offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: gs_ir_uses_scratch_offset: @@ -6924,94 +6935,94 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; 
GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; 
GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v24, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0x3eae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 
v21, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; 
GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v9, v2, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 
offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v22 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v8 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v5, 0x200, v24 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v5, off ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v9, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: gs_ir_uses_scratch_offset: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 19f0e93c308d8..cc60b7f450b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -498,6 +498,8 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) 
%in ; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; NOSDWA-NEXT: v_mov_b32_e32 v8, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v9, s1 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v10, v3, v7 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -508,21 +510,20 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; NOSDWA-NEXT: v_mul_lo_u16_e32 v12, v1, v5 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v13, v0, v4 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v4 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v7 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v2, v6 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v5 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v4 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v4, v14, v13 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; NOSDWA-NEXT: v_or_b32_e32 v3, v10, v3 ; NOSDWA-NEXT: v_or_b32_e32 v2, v11, v2 ; NOSDWA-NEXT: v_or_b32_e32 v1, v12, v1 -; NOSDWA-NEXT: v_or_b32_e32 v0, v13, v0 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4 ; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; NOSDWA-NEXT: s_endpgm ; @@ -873,20 +874,20 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; NOSDWA-NEXT: v_mul_f16_e32 v1, v5, v1 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; NOSDWA-NEXT: v_mul_f16_e32 v0, v4, v0 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; NOSDWA-NEXT: v_mul_f16_e32 v10, v11, v10 +; NOSDWA-NEXT: v_mul_f16_e32 v4, v11, v10 ; NOSDWA-NEXT: v_mul_f16_e32 v7, v12, v7 ; 
NOSDWA-NEXT: v_mul_f16_e32 v6, v13, v6 -; NOSDWA-NEXT: v_mul_f16_e32 v4, v4, v5 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; NOSDWA-NEXT: v_mul_f16_e32 v5, v14, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v4 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v7 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v6 -; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v5 ; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; NOSDWA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index bbdfc767208cc..790b3655cbc3f 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1889,39 +1889,39 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; SI-NEXT: 
v_cmp_eq_u32_e64 s[4:5], 0, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; SI-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 @@ -1932,7 +1932,7 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 ; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 ; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2039,96 +2039,96 @@ define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) { ; SI-LABEL: v_select_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 
16, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v19, 
v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v7, v22, v7 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v11, v26, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v9, v24, v25 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, v18, v19 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_or_b32_e32 v11, v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; SI-NEXT: 
v_cndmask_b32_e32 v11, v11, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v15, v22, v6, vcc ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc +; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc ; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc -; SI-NEXT: v_cndmask_b32_e32 v16, v3, v14, vcc +; SI-NEXT: v_cndmask_b32_e32 v16, v7, v14, vcc ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 @@ -2174,127 +2174,130 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; SI-LABEL: v_vselect_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 offset:56 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: 
v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v33 -; SI-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; SI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; SI-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 -; SI-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 -; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[14:15] ; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: 
buffer_load_dword v17, off, s[0:3], s32 offset:32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[12:13] +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9] -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v29 -; SI-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v17, v24 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v28 -; SI-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; SI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: 
v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[10:11] +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cndmask_b32_e64 v3, v16, v3, s[8:9] +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[8:9] +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[8:9] +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[16:17] +; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[4:5] ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; SI-NEXT: 
v_cmp_eq_u32_e64 s[18:19], 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cndmask_b32_e64 v5, v16, v5, s[18:19] +; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cndmask_b32_e64 v11, v22, v11, s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v31 +; SI-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[20:21] +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 +; SI-NEXT: v_cndmask_b32_e64 v9, v23, v9, s[8:9] +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cndmask_b32_e64 v10, v24, v10, s[8:9] +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cndmask_b32_e32 v14, v19, v14, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2303,64 +2306,64 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v26 ; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[10:11] ; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v24 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v29 -; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v26 -; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v28 -; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; VI-NEXT: 
v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_cndmask_b32_e64 v22, v24, v22, s[20:21] -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v28 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[10:11] +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v20 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 -; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[14:15] -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; VI-NEXT: v_cndmask_b32_e32 v25, v27, v26, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[10:11] -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v27 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[8:9] -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v29 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 ; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; VI-NEXT: v_cndmask_b32_e64 v1, v9, 
v1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v29, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; VI-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v14, vcc ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v30 -; VI-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v30 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc ; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; VI-NEXT: v_cndmask_b32_e64 v18, v20, v18, s[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; VI-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[16:17] +; VI-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[14:15] ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_cndmask_b32_e64 v22, v26, v22, s[16:17] ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 ; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v18 ; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 @@ -2697,12 +2700,12 @@ define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) { ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 @@ -2860,25 +2863,25 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31 +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31 +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31 +; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 +; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31 ; SI-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31 +; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31 @@ -2889,32 +2892,32 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v0, v31, v0, s[16:17] +; SI-NEXT: v_cndmask_b32_e64 v0, v31, v0, s[6:7] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v1, v31, v1, s[14:15] +; SI-NEXT: v_cndmask_b32_e64 v1, v31, v1, s[8:9] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v2, v31, v2, s[12:13] +; SI-NEXT: v_cndmask_b32_e64 v2, v31, v2, s[10:11] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v3, v31, v3, s[10:11] +; SI-NEXT: v_cndmask_b32_e64 v3, v31, v3, s[12:13] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v4, v31, v4, s[8:9] +; SI-NEXT: v_cndmask_b32_e64 v4, v31, v4, s[14:15] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v5, v31, v5, s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v5, v31, v5, s[16:17] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 
; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 @@ -3144,12 +3147,12 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 @@ -3194,13 +3197,13 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 -; VI-NEXT: v_cndmask_b32_e32 v50, v43, v55, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 -; VI-NEXT: v_cndmask_b32_e32 v53, v43, v55, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 +; VI-NEXT: v_cndmask_b32_e32 v52, v43, v55, vcc ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 ; VI-NEXT: s_waitcnt vmcnt(7) @@ -3209,8 +3212,8 @@ define <32 x half> 
@v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 -; VI-NEXT: v_cndmask_b32_e32 v52, v43, v55, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 +; VI-NEXT: v_cndmask_b32_e32 v53, v43, v55, vcc ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 ; VI-NEXT: s_waitcnt vmcnt(5) @@ -3221,8 +3224,8 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 -; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 +; VI-NEXT: v_cndmask_b32_e32 v50, v43, v55, vcc ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 @@ -3309,17 +3312,17 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 ; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 ; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 ; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 ; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 ; VI-NEXT: v_or_b32_sdwa v5, 
v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 ; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 ; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 ; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3329,13 +3332,13 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 ; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 -; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v35 ; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 ; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; VI-NEXT: v_or_b32_sdwa v12, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3480,9 +3483,9 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:112 ; GFX11-FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:104 ; GFX11-FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:96 ; GFX11-FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:88 @@ -3513,6 +3516,8 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v30 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v29 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v12 @@ -3537,22 +3542,20 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v30 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v15 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v99, v100, v99, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v97, v98, v97, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v100, v99, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v97, v98, v97, s0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v33 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35 @@ -3586,27 +3589,25 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v146, v145, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v148, v147, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v98, v32, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v149, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v97, v14, 0x5040100 ; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v13, v29, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v99, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v97, v14, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo @@ -3615,8 +3616,7 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) @@ -3624,7 +3624,7 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v33, v13, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo @@ -3633,8 +3633,7 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) @@ -3642,28 +3641,31 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v4, v50, v4, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v51, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v53, v1, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v51, v3, 0x5040100 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v32, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v1, v31, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v52, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v98, v96, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <32 x i32> %cond, zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 3a2d056dc504a..8f5151fa2febe 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -295,34 +295,34 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8 -; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GCN-NEXT: v_sub_i32_e32 v22, vcc, 64, v12 +; GCN-NEXT: v_or_b32_e32 v21, v19, v17 +; GCN-NEXT: v_subrev_i32_e32 v17, vcc, 64, v8 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_lshl_b64 v[19:20], v[0:1], v17 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 -; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_or_b32_e32 v16, v18, v16 +; GCN-NEXT: s_and_b64 vcc, vcc, s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v20, 
v20, v21, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5] -; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc -; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v12 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12 -; GCN-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GCN-NEXT: v_or_b32_e32 v16, v16, v9 -; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13] -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12 -; GCN-NEXT: v_or_b32_e32 v11, v17, v10 +; GCN-NEXT: v_lshr_b64 v[18:19], v[4:5], v22 +; GCN-NEXT: v_cndmask_b32_e64 v3, v20, v3, s[4:5] +; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] ; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: v_or_b32_e32 v16, v16, v18 +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5] +; GCN-NEXT: v_or_b32_e32 v11, v17, v19 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v12 ; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[6:7] @@ -344,34 +344,34 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 -; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GCN-NEXT: v_sub_i32_e32 v22, vcc, 64, v12 +; GCN-NEXT: v_or_b32_e32 v21, v19, v17 +; GCN-NEXT: v_subrev_i32_e32 v17, vcc, 64, v8 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_lshr_b64 v[19:20], v[2:3], v17 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 -; GCN-NEXT: 
v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_or_b32_e32 v16, v18, v16 +; GCN-NEXT: s_and_b64 vcc, vcc, s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] -; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc -; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v12 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 -; GCN-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5] -; GCN-NEXT: v_or_b32_e32 v16, v16, v9 -; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13] -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12 -; GCN-NEXT: v_or_b32_e32 v11, v17, v10 +; GCN-NEXT: v_lshl_b64 v[18:19], v[6:7], v22 +; GCN-NEXT: v_cndmask_b32_e64 v1, v20, v1, s[4:5] +; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] ; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: v_or_b32_e32 v16, v16, v18 +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5] +; GCN-NEXT: v_or_b32_e32 v11, v17, v19 ; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8 ; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v12 ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7] @@ -393,34 +393,34 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 -; GCN-NEXT: 
v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GCN-NEXT: v_sub_i32_e32 v22, vcc, 64, v12 +; GCN-NEXT: v_or_b32_e32 v21, v19, v17 +; GCN-NEXT: v_subrev_i32_e32 v17, vcc, 64, v8 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_ashr_i64 v[19:20], v[2:3], v17 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 -; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_or_b32_e32 v16, v18, v16 +; GCN-NEXT: s_and_b64 vcc, vcc, s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] -; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc -; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v12 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 -; GCN-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5] -; GCN-NEXT: v_or_b32_e32 v16, v16, v9 -; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13] -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12 -; GCN-NEXT: v_or_b32_e32 v11, v17, v10 +; GCN-NEXT: v_lshl_b64 v[18:19], v[6:7], v22 +; GCN-NEXT: v_cndmask_b32_e64 v1, v20, v1, s[4:5] +; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] ; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: v_or_b32_e32 v16, v16, v18 +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] ; 
GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5] +; GCN-NEXT: v_or_b32_e32 v11, v17, v19 ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7] ; GCN-NEXT: v_cndmask_b32_e64 v9, v10, v11, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 28330bfc9bb69..ffae40c9514e9 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -911,20 +911,20 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshl_b64 v[5:6], v[5:6], v2 -; SI-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 -; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 +; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 +; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll index dd42a1dd44320..ff24f1d3ac6f7 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll @@ -321,15 +321,15 @@ define void @v_shuffle_v2i64_v4i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index 7ee7c83e0122d..38c6723671974 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -295,10 +295,10 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -591,10 +591,10 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; 
def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -635,17 +635,17 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[1:16] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v21, 0 ; GFX900-NEXT: v_mov_b32_e32 v17, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v18, v0 -; GFX900-NEXT: v_mov_b32_e32 v19, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v18, v16 +; GFX900-NEXT: v_mov_b32_e32 v19, v0 +; GFX900-NEXT: v_mov_b32_e32 v20, v1 +; GFX900-NEXT: global_store_dwordx4 v21, v[17:20], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,8 +703,8 @@ define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v18 -; GFX900-NEXT: v_mov_b32_e32 v1, v19 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v19 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -759,8 +759,8 @@ define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v20 -; GFX900-NEXT: v_mov_b32_e32 v3, v21 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v21 ; GFX900-NEXT: 
global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -815,8 +815,8 @@ define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v22 -; GFX900-NEXT: v_mov_b32_e32 v5, v23 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v23 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -871,8 +871,8 @@ define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v24 -; GFX900-NEXT: v_mov_b32_e32 v7, v25 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v25 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -927,8 +927,8 @@ define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v26 -; GFX900-NEXT: v_mov_b32_e32 v9, v27 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v27 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -983,8 +983,8 @@ define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v28 -; GFX900-NEXT: v_mov_b32_e32 v11, v29 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v29 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1039,8 +1039,8 @@ define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v30 -; 
GFX900-NEXT: v_mov_b32_e32 v13, v31 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v31 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1091,12 +1091,12 @@ define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v14 ; GFX900-NEXT: v_mov_b32_e32 v3, v15 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1465,10 +1465,10 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1825,12 +1825,12 @@ define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v14 ; GFX900-NEXT: v_mov_b32_e32 v3, v15 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1876,10 +1876,10 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) 
inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1922,12 +1922,12 @@ define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1978,12 +1978,12 @@ define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2034,12 +2034,12 @@ define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 
v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2090,12 +2090,12 @@ define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v12, v0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v13, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2146,12 +2146,12 @@ define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v14, v0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v15, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2202,12 +2202,12 @@ define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 
v16, v0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v17, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2687,15 +2687,15 @@ define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2743,15 +2743,15 @@ define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2799,15 +2799,15 @@ define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: 
;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v12, v2 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v13, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,15 +2855,15 @@ define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v14, v2 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v15, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2911,15 +2911,15 @@ define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v16, v2 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 
v17, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2967,15 +2967,15 @@ define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 v18, v2 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v19, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3455,15 +3455,15 @@ define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3511,15 +3511,15 @@ define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; 
GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3567,15 +3567,15 @@ define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v14, v4 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v15, v5 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3623,15 +3623,15 @@ define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v16, v4 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v17, v5 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] ; @@ -3679,15 +3679,15 @@ define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v18, v4 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v19, v5 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3735,15 +3735,15 @@ define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 v20, v4 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v21, v5 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4223,15 +4223,15 @@ define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; 
GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4279,15 +4279,15 @@ define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v14, v6 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4335,15 +4335,15 @@ define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v16, v6 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v17, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4391,15 +4391,15 @@ define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v18, v6 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v19, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4447,15 +4447,15 @@ define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v20, v6 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v21, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4503,15 +4503,15 @@ define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 v22, v6 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: 
global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v23, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4991,15 +4991,15 @@ define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v14, v8 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5047,15 +5047,15 @@ define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v16, v8 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v17, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5103,15 +5103,15 @@ define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: 
; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v18, v8 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v19, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5159,15 +5159,15 @@ define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v20, v8 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v21, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5215,15 +5215,15 @@ define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v22, v8 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v23, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], 
s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5271,15 +5271,15 @@ define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 v24, v8 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v25, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,15 +5759,15 @@ define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v16, v10 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v17, v11 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5815,15 +5815,15 @@ define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] 
; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v18, v10 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v19, v11 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5871,15 +5871,15 @@ define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v20, v10 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v21, v11 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5927,15 +5927,15 @@ define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v22, v10 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v23, v11 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5983,15 +5983,15 @@ define void 
@v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v24, v10 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v25, v11 +; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6039,15 +6039,15 @@ define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 v26, v10 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v27, v11 +; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6527,15 +6527,15 @@ define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 +; 
GFX900-NEXT: v_mov_b32_e32 v18, v12 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v19, v13 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6583,15 +6583,15 @@ define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v20, v12 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v21, v13 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6639,15 +6639,15 @@ define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v22, v12 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v23, v13 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6695,15 +6695,15 @@ define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v24, v12 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v25, v13 +; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6751,15 +6751,15 @@ define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v26, v12 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v27, v13 +; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6807,15 +6807,15 @@ define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 v28, v12 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, 
v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v29, v13 +; GFX900-NEXT: global_store_dwordx4 v0, v[26:29], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7295,15 +7295,15 @@ define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v20, v14 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v21, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7351,15 +7351,15 @@ define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v22, v14 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v23, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,15 +7407,15 @@ define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; 
GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v24, v14 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v25, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7463,15 +7463,15 @@ define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v26, v14 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v27, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7519,15 +7519,15 @@ define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v28, v14 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v29, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[26:29], s[16:17] ; 
GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7575,15 +7575,15 @@ define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 v30, v14 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v31, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[28:31], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7917,10 +7917,10 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8293,8 +8293,8 @@ define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8349,8 +8349,8 @@ define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 
; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8405,8 +8405,8 @@ define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8461,8 +8461,8 @@ define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8517,8 +8517,8 @@ define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8573,8 +8573,8 @@ define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8629,8 +8629,8 @@ define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) { ; 
GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v15, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8685,8 +8685,8 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v16 -; GFX900-NEXT: v_mov_b32_e32 v1, v17 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v17 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9097,8 +9097,8 @@ define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9153,8 +9153,8 @@ define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 ; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9209,8 +9209,8 @@ define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] @@ -9265,8 +9265,8 @@ define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v12 -; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9321,8 +9321,8 @@ define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v14 -; GFX900-NEXT: v_mov_b32_e32 v11, v15 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9377,8 +9377,8 @@ define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9433,8 +9433,8 @@ define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, v18 -; GFX900-NEXT: v_mov_b32_e32 v15, v19 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v15, v19 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9489,8 +9489,8 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v16 -; GFX900-NEXT: v_mov_b32_e32 v3, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9901,8 +9901,8 @@ define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9957,8 +9957,8 @@ define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 ; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10013,8 +10013,8 @@ define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10069,8 +10069,8 @@ define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v14 -; GFX900-NEXT: v_mov_b32_e32 v9, v15 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10125,8 +10125,8 @@ define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: 
;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10181,8 +10181,8 @@ define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v18 -; GFX900-NEXT: v_mov_b32_e32 v13, v19 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v19 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10237,8 +10237,8 @@ define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, v20 -; GFX900-NEXT: v_mov_b32_e32 v15, v21 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v15, v21 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10293,8 +10293,8 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10705,8 +10705,8 @@ define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] @@ -10761,8 +10761,8 @@ define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v12 -; GFX900-NEXT: v_mov_b32_e32 v5, v13 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v13 ; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10817,8 +10817,8 @@ define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v14 -; GFX900-NEXT: v_mov_b32_e32 v7, v15 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v15 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10873,8 +10873,8 @@ define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10929,8 +10929,8 @@ define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v18 -; GFX900-NEXT: v_mov_b32_e32 v11, v19 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v19 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10985,8 +10985,8 @@ define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v20 -; GFX900-NEXT: v_mov_b32_e32 v13, v21 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v13, v21 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11041,8 +11041,8 @@ define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, v22 -; GFX900-NEXT: v_mov_b32_e32 v15, v23 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v15, v23 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11097,8 +11097,8 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11509,8 +11509,8 @@ define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11565,8 +11565,8 @@ define void @v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 ; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11621,8 +11621,8 @@ define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: 
;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11677,8 +11677,8 @@ define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v18 -; GFX900-NEXT: v_mov_b32_e32 v9, v19 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v19 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11733,8 +11733,8 @@ define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v20 -; GFX900-NEXT: v_mov_b32_e32 v11, v21 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v21 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11789,8 +11789,8 @@ define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v22 -; GFX900-NEXT: v_mov_b32_e32 v13, v23 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v23 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11845,8 +11845,8 @@ define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, v24 -; GFX900-NEXT: v_mov_b32_e32 v15, v25 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v15, v25 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] @@ -11901,8 +11901,8 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -12313,8 +12313,8 @@ define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -12369,8 +12369,8 @@ define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -12425,8 +12425,8 @@ define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v18 -; GFX900-NEXT: v_mov_b32_e32 v7, v19 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v19 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -12481,8 +12481,8 @@ define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v20 -; GFX900-NEXT: v_mov_b32_e32 v9, v21 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 
v9, v21 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -12537,8 +12537,8 @@ define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v22 -; GFX900-NEXT: v_mov_b32_e32 v11, v23 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v23 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -12593,8 +12593,8 @@ define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v24 -; GFX900-NEXT: v_mov_b32_e32 v13, v25 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v25 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -12649,8 +12649,8 @@ define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, v26 -; GFX900-NEXT: v_mov_b32_e32 v15, v27 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v15, v27 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -12705,8 +12705,8 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -13117,8 +13117,8 @@ define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: 
;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v16 -; GFX900-NEXT: v_mov_b32_e32 v3, v17 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v17 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -13173,8 +13173,8 @@ define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v18 -; GFX900-NEXT: v_mov_b32_e32 v5, v19 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v19 ; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -13229,8 +13229,8 @@ define void @v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v20 -; GFX900-NEXT: v_mov_b32_e32 v7, v21 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v21 ; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -13285,8 +13285,8 @@ define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v22 -; GFX900-NEXT: v_mov_b32_e32 v9, v23 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v23 ; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -13341,8 +13341,8 @@ define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v24 -; GFX900-NEXT: v_mov_b32_e32 v11, v25 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v25 ; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ 
-13397,8 +13397,8 @@ define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v26 -; GFX900-NEXT: v_mov_b32_e32 v13, v27 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v27 ; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -13453,8 +13453,8 @@ define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, v28 -; GFX900-NEXT: v_mov_b32_e32 v15, v29 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v15, v29 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -13509,8 +13509,8 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v17 ; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll index ae31524ebaa7f..4bcd924493d78 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll @@ -321,15 +321,15 @@ define void @v_shuffle_v2p0_v4p0__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; 
GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll index 6e156d2d4a2f5..674f46587b957 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v3i64_v4i64__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,16 +1582,16 @@ define void @v_shuffle_v3i64_v4i64__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: 
v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1643,17 +1643,17 @@ define void @v_shuffle_v3i64_v4i64__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 ; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1877,17 +1877,19 @@ define void @v_shuffle_v3i64_v4i64__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 ; 
GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2557,16 +2559,16 @@ define void @v_shuffle_v3i64_v4i64__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 ; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2735,19 +2737,19 @@ define void @v_shuffle_v3i64_v4i64__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 ; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; 
GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,19 +3420,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4283,16 +4285,16 @@ define void @v_shuffle_v3i64_v4i64__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, 
v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5029,19 +5031,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5879,19 +5881,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; 
GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6735,19 +6737,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7542,19 +7544,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; 
def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll index b03066e66cf66..994138f4c4d42 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v3p0_v4p0__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,16 +1582,16 @@ define void 
@v_shuffle_v3p0_v4p0__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1643,17 +1643,17 @@ define void @v_shuffle_v3p0_v4p0__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 ; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1877,17 +1877,19 @@ define void @v_shuffle_v3p0_v4p0__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 ; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2557,16 +2559,16 @@ define void @v_shuffle_v3p0_v4p0__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 ; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2735,19 +2737,19 @@ define void @v_shuffle_v3p0_v4p0__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: 
;;#ASMSTART ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 ; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,19 +3420,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4283,16 +4285,16 @@ define void @v_shuffle_v3p0_v4p0__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: 
v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5029,19 +5031,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5879,19 +5881,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: 
v_shuffle_v3p0_v4p0__7_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6735,19 +6737,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: 
global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7542,19 +7544,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll index ea9ef2f1ac94a..b5ceb609ab5c8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v4i64_v4i64__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: 
global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,16 +918,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,18 +1190,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: 
global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2123,18 +2123,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2198,13 +2198,12 @@ define void @v_shuffle_v4i64_v4i64__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; 
GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2262,18 +2261,17 @@ define void @v_shuffle_v4i64_v4i64__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2465,19 +2463,18 @@ define void @v_shuffle_v4i64_v4i64__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; 
GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2736,16 +2733,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2931,18 +2928,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, 
v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3134,18 +3131,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3704,9 +3701,9 @@ define void @v_shuffle_v4i64_v4i64__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 ; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] @@ -3765,18 +3762,18 @@ define void 
@v_shuffle_v4i64_v4i64__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3972,18 +3969,18 @@ define void @v_shuffle_v4i64_v4i64__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 ; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5185,9 +5182,9 @@ define void 
@v_shuffle_v4i64_v4i64__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v12 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 ; GFX900-NEXT: v_mov_b32_e32 v1, v13 ; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] @@ -5246,18 +5243,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5453,9 +5450,9 @@ define void @v_shuffle_v4i64_v4i64__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 ; GFX900-NEXT: v_mov_b32_e32 v2, v8 ; GFX900-NEXT: v_mov_b32_e32 v3, v9 @@ -6658,9 +6655,9 @@ define void 
@v_shuffle_v4i64_v4i64__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 ; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -6718,18 +6715,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6852,19 +6849,18 @@ define void @v_shuffle_v4i64_v4i64__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] 
offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6929,9 +6925,9 @@ define void @v_shuffle_v4i64_v4i64__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 ; GFX900-NEXT: v_mov_b32_e32 v2, v8 ; GFX900-NEXT: v_mov_b32_e32 v3, v9 @@ -8043,18 +8039,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 
v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8713,18 +8709,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9476,18 +9472,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: 
;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10907,18 +10903,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12302,18 +12298,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 
-; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll index ce1c54129f706..7c1117c96988a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v4p0_v4p0__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] ; @@ -918,16 +918,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,18 +1190,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] 
offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2123,18 +2123,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2198,13 +2198,12 @@ define void @v_shuffle_v4p0_v4p0__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2262,18 +2261,17 @@ define void @v_shuffle_v4p0_v4p0__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; 
GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2465,19 +2463,18 @@ define void @v_shuffle_v4p0_v4p0__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2736,16 +2733,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def 
v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2931,18 +2928,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3134,18 +3131,18 @@ define void 
@v_shuffle_v4p0_v4p0__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3704,9 +3701,9 @@ define void @v_shuffle_v4p0_v4p0__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 ; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] @@ -3765,18 +3762,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v6, v10 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3972,18 +3969,18 @@ define void @v_shuffle_v4p0_v4p0__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 ; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5185,9 +5182,9 @@ define void @v_shuffle_v4p0_v4p0__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v12 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 ; GFX900-NEXT: v_mov_b32_e32 v1, v13 ; GFX900-NEXT: 
global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] @@ -5246,18 +5243,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5453,9 +5450,9 @@ define void @v_shuffle_v4p0_v4p0__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 ; GFX900-NEXT: v_mov_b32_e32 v2, v8 ; GFX900-NEXT: v_mov_b32_e32 v3, v9 @@ -6658,9 +6655,9 @@ define void @v_shuffle_v4p0_v4p0__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 ; GFX900-NEXT: 
global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] @@ -6718,18 +6715,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6852,19 +6849,18 @@ define void @v_shuffle_v4p0_v4p0__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 
v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6929,9 +6925,9 @@ define void @v_shuffle_v4p0_v4p0__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 ; GFX900-NEXT: v_mov_b32_e32 v2, v8 ; GFX900-NEXT: v_mov_b32_e32 v3, v9 @@ -8043,18 +8039,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8713,18 +8709,18 @@ define 
void @v_shuffle_v4p0_v4p0__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9476,18 +9472,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] 
offset:16 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10907,18 +10903,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12302,18 +12298,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: 
v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll index 71e4755b58bf2..49213a47dbed9 100644 --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -3,13 +3,15 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader + +; CHECK: global_load_dword + ; CHECK: s_load_dwordx16 ; CHECK-NEXT: s_load_dwordx16 ; CHECK: global_load_dword ; CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword -; CHECK-NEXT: global_load_dword ; CHECK: s_load_dwordx16 ; CHECK-NEXT: s_load_dwordx16 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index eb0d5465cacd9..8f1c71cc3f5af 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -191,83 +191,82 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX908-LABEL: max_32regs_mfma32: ; GFX908: ; %bb.0: ; %bb -; GFX908-NEXT: v_mov_b32_e32 v2, 0x40400000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x40c00000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x40e00000 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 
-; GFX908-NEXT: v_mov_b32_e32 v2, 0x40a00000 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41100000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41200000 -; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a9, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41300000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41400000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41500000 -; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a11, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41600000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41700000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41800000 -; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a14, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a15, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41880000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41900000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41980000 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a17, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a18, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41a00000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41a80000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41b00000 -; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a21, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41b80000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41c00000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41c80000 -; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a23, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41d00000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41d80000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41e00000 -; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 -; 
GFX908-NEXT: v_accvgpr_write_b32 a26, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a27, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41e80000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41f00000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41f80000 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x40400000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40c00000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x40e00000 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x40a00000 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41100000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41200000 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41300000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41400000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41500000 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41600000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41700000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41800000 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41880000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41900000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41980000 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41a00000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41a80000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41b00000 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41b80000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41c00000 +; 
GFX908-NEXT: v_mov_b32_e32 v3, 0x41c80000 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41d00000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41d80000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41e00000 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41e80000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41f00000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41f80000 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, 2.0 ; GFX908-NEXT: v_accvgpr_write_b32 a3, 4.0 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a29, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a30, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a31, 2.0 ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31] -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v0, a[0:31] +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 -; GFX908-NEXT: v_accvgpr_write_b32 a1, v5 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v4 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: global_store_dword v0, v1, s[2:3] +; GFX908-NEXT: global_store_dword v1, v0, s[2:3] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: max_32regs_mfma32: 
@@ -331,7 +330,8 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 @@ -339,13 +339,10 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dword v0, a0, s[2:3] ; GFX90A-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 80c0d0f45eb97..a843b4cfb3f0d 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -602,46 +602,46 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; 
SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_ashr_i64 v[5:6], v[5:6], v2 -; SI-NEXT: v_ashr_i64 v[3:4], v[3:4], v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13 -; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 +; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13 +; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11 +; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; VI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_ashrrev_i64 v[5:6], v2, v[5:6] -; VI-NEXT: v_ashrrev_i64 v[3:4], v0, v[3:4] +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt 
vmcnt(1) +; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10] -; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8] -; VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 +; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7] +; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5] +; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1] +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index f614f58d8e1dc..d40bdeb46a017 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -3365,96 +3365,96 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v0, v8 ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; TONGA-NEXT: v_xor_b32_e32 v14, v9, v8 +; TONGA-NEXT: v_xor_b32_e32 v15, v9, v8 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v8 -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v14 +; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v15 ; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v1 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v14 -; TONGA-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc +; TONGA-NEXT: v_sub_u32_e32 v18, vcc, 0, v15 +; TONGA-NEXT: v_subb_u32_e32 v19, vcc, 0, v1, vcc ; TONGA-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 ; TONGA-NEXT: v_rcp_f32_e32 v8, v8 ; TONGA-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 ; TONGA-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 ; TONGA-NEXT: v_trunc_f32_e32 v9, v9 ; TONGA-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 -; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v8 -; TONGA-NEXT: v_mul_lo_u32 v10, v15, v12 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v13, 0 -; TONGA-NEXT: v_mul_lo_u32 v11, v16, v13 +; TONGA-NEXT: 
v_cvt_u32_f32_e32 v16, v9 +; TONGA-NEXT: v_cvt_u32_f32_e32 v17, v8 +; TONGA-NEXT: v_mul_lo_u32 v10, v18, v16 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v18, v17, 0 +; TONGA-NEXT: v_mul_lo_u32 v11, v19, v17 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v10 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, v9, v11 -; TONGA-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v13, v11, 0 -; TONGA-NEXT: v_mul_hi_u32 v17, v13, v8 -; TONGA-NEXT: v_add_u32_e32 v17, vcc, v17, v9 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v12, v8, 0 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v10, vcc -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v17, v8 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v18, v9, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v10 +; TONGA-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v17, v11, 0 +; TONGA-NEXT: v_mul_hi_u32 v20, v17, v8 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v16, v8, 0 +; TONGA-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v16, v11, 0 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v20, v9 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v13 +; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v11 ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v17, vcc, v13, v8 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v12, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v17, 0 -; TONGA-NEXT: v_mul_lo_u32 v12, v15, v18 -; TONGA-NEXT: v_mul_lo_u32 v13, v16, v17 -; TONGA-NEXT: v_mul_hi_u32 v15, v17, v8 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v18, v8, 0 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v17, v8 +; TONGA-NEXT: v_addc_u32_e32 v16, vcc, v16, v9, vcc +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v18, v14, 0 +; TONGA-NEXT: v_mul_lo_u32 v12, v18, v16 +; TONGA-NEXT: v_mul_lo_u32 v13, v19, v14 +; TONGA-NEXT: v_mul_hi_u32 v17, v14, v8 +; 
TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v16, v8, 0 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v12, v9 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v13 -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v17, v9, 0 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v18, v9, 0 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v15, v12 +; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v9, 0 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v9, 0 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v17, v12 ; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; TONGA-NEXT: v_add_u32_e32 v10, vcc, v12, v10 ; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v13, v11, vcc ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v17, v8 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v18, v9, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v14, v8 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v16, v9, vcc ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v5 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v4, v12 ; TONGA-NEXT: v_xor_b32_e32 v13, v8, v12 ; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v13, v11, 0 -; TONGA-NEXT: v_mul_hi_u32 v15, v13, v10 +; TONGA-NEXT: v_mul_hi_u32 v14, v13, v10 ; TONGA-NEXT: v_addc_u32_e32 v5, vcc, v5, v12, vcc ; TONGA-NEXT: v_xor_b32_e32 v5, v5, v12 -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v15, v8 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v14, v8 ; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v9, vcc ; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v10, 0 ; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v15, v8 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v14, v8 ; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc ; TONGA-NEXT: v_add_u32_e32 v10, vcc, v8, v10 ; TONGA-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; TONGA-NEXT: v_mul_lo_u32 v11, v14, v8 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v10, 0 +; TONGA-NEXT: v_mul_lo_u32 v11, 
v15, v8 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v10, 0 ; TONGA-NEXT: v_mul_lo_u32 v10, v1, v10 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v11, v9 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v9 ; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v5, v9 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v13, v8 ; TONGA-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, vcc -; TONGA-NEXT: v_sub_u32_e64 v11, s[0:1], v8, v14 +; TONGA-NEXT: v_sub_u32_e64 v11, s[0:1], v8, v15 ; TONGA-NEXT: v_subbrev_u32_e64 v13, s[2:3], 0, v10, s[0:1] ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v13, v1 -; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v11, v14 +; TONGA-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v11, v15 ; TONGA-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] ; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v13, v1 ; TONGA-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v16, s[0:1], v11, v14 +; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[2:3] +; TONGA-NEXT: v_sub_u32_e64 v16, s[0:1], v11, v15 ; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc ; TONGA-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v10, s[0:1] ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v14 ; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v14 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v15 ; TONGA-NEXT: v_cndmask_b32_e64 v10, v13, v10, s[0:1] ; TONGA-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -3500,52 +3500,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v12, v3, v0 ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 ; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v12 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 -; TONGA-NEXT: v_subb_u32_e32 v14, vcc, 0, v12, vcc +; TONGA-NEXT: v_sub_u32_e32 
v15, vcc, 0, v5 +; TONGA-NEXT: v_subb_u32_e32 v16, vcc, 0, v12, vcc ; TONGA-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; TONGA-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; TONGA-NEXT: v_trunc_f32_e32 v1, v1 ; TONGA-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v1 -; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v0 -; TONGA-NEXT: v_mul_lo_u32 v3, v13, v10 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v13, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v4, v14, v11 +; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v0 +; TONGA-NEXT: v_mul_lo_u32 v3, v15, v13 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v15, v14, 0 +; TONGA-NEXT: v_mul_lo_u32 v4, v16, v14 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v1, v4 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v11, v15, 0 -; TONGA-NEXT: v_mul_hi_u32 v1, v11, v0 -; TONGA-NEXT: v_add_u32_e32 v16, vcc, v1, v3 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v0, 0 -; TONGA-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v10, v15, 0 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v16, v0 -; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v17, v1, vcc -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v14, v1, 0 +; TONGA-NEXT: v_mul_hi_u32 v17, v14, v0 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v1, 0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v13, v0, 0 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v17, v3 +; TONGA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v4, v1, vcc +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v10 ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v15, vcc, 
v11, v0 -; TONGA-NEXT: v_addc_u32_e32 v16, vcc, v10, v1, vcc -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v13, v15, 0 -; TONGA-NEXT: v_mul_lo_u32 v10, v13, v16 -; TONGA-NEXT: v_mul_lo_u32 v11, v14, v15 -; TONGA-NEXT: v_mul_hi_u32 v13, v15, v0 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v16, v0, 0 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v14, v0 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v15, v14, 0 +; TONGA-NEXT: v_mul_lo_u32 v10, v15, v13 +; TONGA-NEXT: v_mul_lo_u32 v11, v16, v14 +; TONGA-NEXT: v_mul_hi_u32 v15, v14, v0 +; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v13, v0, 0 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v11 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v15, v1, 0 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v16, v1, 0 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v13, v10 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v14, v1, 0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v13, v1, 0 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v15, v10 ; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v10, v3 ; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v11, v4, vcc ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v15, v0 -; TONGA-NEXT: v_addc_u32_e32 v4, vcc, v16, v1, vcc +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v14, v0 +; TONGA-NEXT: v_addc_u32_e32 v4, vcc, v13, v1, vcc ; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v11 ; TONGA-NEXT: v_xor_b32_e32 v10, v0, v11 @@ -6121,104 +6121,104 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8 ; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v9 ; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v8 -; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v9 -; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v8, vcc +; 
TONGA-NEXT: v_sub_u32_e32 v24, vcc, 0, v9 +; TONGA-NEXT: v_subb_u32_e32 v25, vcc, 0, v8, vcc ; TONGA-NEXT: v_madmk_f32 v11, v18, 0x4f800000, v11 ; TONGA-NEXT: v_rcp_f32_e32 v11, v11 ; TONGA-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 ; TONGA-NEXT: v_mul_f32_e32 v18, 0x2f800000, v11 ; TONGA-NEXT: v_trunc_f32_e32 v18, v18 ; TONGA-NEXT: v_madmk_f32 v11, v18, 0xcf800000, v11 -; TONGA-NEXT: v_cvt_u32_f32_e32 v22, v18 +; TONGA-NEXT: v_cvt_u32_f32_e32 v23, v18 ; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 -; TONGA-NEXT: v_mul_lo_u32 v20, v23, v22 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v21, v24, v11 +; TONGA-NEXT: v_mul_lo_u32 v20, v24, v23 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v24, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v21, v25, v11 ; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v20 -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v19, v21 -; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0 -; TONGA-NEXT: v_mul_hi_u32 v25, v11, v18 -; TONGA-NEXT: v_add_u32_e32 v25, vcc, v25, v19 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v22, v18, 0 -; TONGA-NEXT: v_addc_u32_e32 v26, vcc, 0, v20, vcc -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v22, v21, 0 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v18 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v26, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v21, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v20 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v22, v19, vcc -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v22, v23, v25 -; TONGA-NEXT: v_mul_lo_u32 v23, v24, v11 -; TONGA-NEXT: v_mul_hi_u32 v24, v11, v18 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v18, 0 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v22, v19 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v23 -; TONGA-NEXT: v_mad_u64_u32 v[22:23], s[0:1], v11, v19, 0 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], 
v25, v19, 0 -; TONGA-NEXT: v_add_u32_e32 v22, vcc, v24, v22 -; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; TONGA-NEXT: v_add_u32_e32 v20, vcc, v22, v20 -; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v23, v21, vcc +; TONGA-NEXT: v_add_u32_e32 v27, vcc, v19, v21 +; TONGA-NEXT: v_mul_hi_u32 v26, v11, v18 +; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v27, 0 +; TONGA-NEXT: v_mad_u64_u32 v[21:22], s[0:1], v23, v18, 0 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v26, v19 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v21 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v27, 0 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v20, v22, vcc ; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; TONGA-NEXT: v_add_u32_e32 v18, vcc, v20, v18 ; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 -; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v25, v19, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v22 -; TONGA-NEXT: v_xor_b32_e32 v23, v18, v22 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v20, 0 -; TONGA-NEXT: v_mul_hi_u32 v21, v23, v11 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v22, vcc -; TONGA-NEXT: v_xor_b32_e32 v15, v15, v22 -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v18 +; TONGA-NEXT: v_addc_u32_e32 v26, vcc, v23, v19, vcc +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v24, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v20, v24, v26 +; TONGA-NEXT: v_mul_lo_u32 v21, v25, v11 +; TONGA-NEXT: v_mul_hi_u32 v25, v11, v18 +; TONGA-NEXT: v_mad_u64_u32 v[23:24], s[0:1], v26, v18, 0 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v20, v19 +; TONGA-NEXT: v_add_u32_e32 v21, vcc, v19, v21 +; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0 +; TONGA-NEXT: v_mad_u64_u32 v[21:22], s[0:1], v26, v21, 0 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v19 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v20, vcc +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v23 +; TONGA-NEXT: 
v_addc_u32_e32 v18, vcc, v19, v24, vcc +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v22, vcc +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v21 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; TONGA-NEXT: v_addc_u32_e32 v22, vcc, v26, v19, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v23 +; TONGA-NEXT: v_xor_b32_e32 v24, v18, v23 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v24, v22, 0 +; TONGA-NEXT: v_mul_hi_u32 v25, v24, v11 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v23, vcc +; TONGA-NEXT: v_xor_b32_e32 v15, v15, v23 +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v15, v11, 0 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v25, v18 ; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v19, vcc -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v11, 0 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v15, v20, 0 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v24, v18 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v25, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v22, 0 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v20 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v25, v21, vcc +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v19, vcc ; TONGA-NEXT: v_mul_lo_u32 v20, v9, v18 ; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v11, 0 ; TONGA-NEXT: v_mul_lo_u32 v11, v8, v11 ; TONGA-NEXT: v_add_u32_e32 v19, vcc, v20, v19 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v19 ; TONGA-NEXT: v_sub_u32_e32 v19, vcc, v15, v11 -; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v23, v18 +; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v24, v18 ; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, vcc ; TONGA-NEXT: v_sub_u32_e64 v20, s[0:1], v18, v9 ; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v19, s[0:1] ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], 
v21, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v23, 0, -1, s[2:3] +; TONGA-NEXT: v_cndmask_b32_e64 v22, 0, -1, s[2:3] ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v9 ; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v15, v11, vcc ; TONGA-NEXT: v_cndmask_b32_e64 v24, 0, -1, s[2:3] ; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v8 ; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, s[0:1] ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[2:3] +; TONGA-NEXT: v_cndmask_b32_e64 v22, v22, v24, s[2:3] ; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v20, v9 ; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v18, v9 ; TONGA-NEXT: v_subbrev_u32_e64 v19, s[0:1], 0, v19, s[0:1] ; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23 +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v22 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v15, v9, vcc ; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v24, s[0:1] ; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; TONGA-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[0:1] ; TONGA-NEXT: v_cndmask_b32_e32 v9, v18, v20, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v8, v11, v19, vcc -; TONGA-NEXT: v_xor_b32_e32 v9, v9, v22 -; TONGA-NEXT: v_xor_b32_e32 v11, v8, v22 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v9, v22 -; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v11, v22, vcc +; TONGA-NEXT: v_xor_b32_e32 v9, v9, v23 +; TONGA-NEXT: v_xor_b32_e32 v11, v8, v23 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v9, v23 +; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v11, v23, vcc ; TONGA-NEXT: s_cbranch_execnz .LBB12_3 ; TONGA-NEXT: .LBB12_2: ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v10 @@ -8883,43 +8883,43 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx4 s[8:11], 
s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[10:11] +; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[10:11] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 30, v9 -; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GCN-NEXT: v_add_co_u32_e32 v9, vcc, v0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 30, v10 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v2, v10 +; GCN-NEXT: v_add_co_u32_e32 v9, vcc, v0, v9 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 30, v11 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v3, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v7 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc ; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v4, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 30, v12 -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v6, v12 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v7, vcc +; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v5, vcc ; GCN-NEXT: v_and_b32_e32 v9, -4, v9 +; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v7 +; GCN-NEXT: v_and_b32_e32 v11, -4, v11 ; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v9 -; GCN-NEXT: v_and_b32_e32 v10, -4, v10 +; GCN-NEXT: v_sub_co_u32_e64 v4, s[0:1], v4, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 30, v10 ; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v13, vcc -; GCN-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v10 -; GCN-NEXT: v_and_b32_e32 v11, -4, v11 -; GCN-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v14, vcc -; GCN-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v11 -; GCN-NEXT: v_and_b32_e32 v12, -4, v12 -; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v15, vcc -; GCN-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v12 -; GCN-NEXT: 
v_subb_co_u32_e32 v7, vcc, v7, v16, vcc -; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN-NEXT: v_lshrrev_b32_e32 v11, 30, v12 +; GCN-NEXT: v_add_co_u32_e64 v9, s[2:3], v2, v9 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v6, v11 +; GCN-NEXT: v_and_b32_e32 v9, -4, v9 +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v7, vcc +; GCN-NEXT: v_addc_co_u32_e64 v10, s[2:3], 0, v3, s[2:3] +; GCN-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v9 +; GCN-NEXT: v_and_b32_e32 v9, -4, v11 +; GCN-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v10, vcc +; GCN-NEXT: v_subb_co_u32_e64 v5, vcc, v5, v14, s[0:1] +; GCN-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v9 +; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v12, vcc +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v4i64_4: @@ -8972,52 +8972,52 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: v_mov_b32_e32 v0, s2 -; TONGA-NEXT: v_mov_b32_e32 v1, s3 +; TONGA-NEXT: s_add_u32 s0, s6, 16 +; TONGA-NEXT: v_mov_b32_e32 v0, s6 +; TONGA-NEXT: v_mov_b32_e32 v1, s7 +; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; TONGA-NEXT: s_add_u32 s4, s2, 16 -; TONGA-NEXT: s_addc_u32 s5, s3, 0 -; TONGA-NEXT: v_mov_b32_e32 v4, s4 -; TONGA-NEXT: v_mov_b32_e32 v5, s5 +; TONGA-NEXT: v_mov_b32_e32 v5, s1 +; TONGA-NEXT: v_mov_b32_e32 v4, s0 ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; TONGA-NEXT: v_mov_b32_e32 v9, s1 -; TONGA-NEXT: v_mov_b32_e32 v8, s0 -; TONGA-NEXT: s_add_u32 s0, s0, 16 -; TONGA-NEXT: s_addc_u32 s1, s1, 0 -; TONGA-NEXT: v_mov_b32_e32 v11, s1 -; TONGA-NEXT: v_mov_b32_e32 v10, s0 +; TONGA-NEXT: v_mov_b32_e32 v9, s5 +; TONGA-NEXT: 
v_mov_b32_e32 v8, s4 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 +; TONGA-NEXT: v_lshrrev_b32_e32 v10, 30, v10 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v0, v10 +; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v5 ; TONGA-NEXT: v_lshrrev_b32_e32 v12, 30, v12 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v0, v12 -; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v4, v12 +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v5, vcc +; TONGA-NEXT: v_and_b32_e32 v10, -4, v10 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v10 +; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v7 ; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 -; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 -; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13 -; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v2, v13 +; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc +; TONGA-NEXT: v_lshrrev_b32_e32 v10, 30, v10 +; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; TONGA-NEXT: v_sub_u32_e64 v4, s[0:1], v4, v12 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v6, v10 +; TONGA-NEXT: v_addc_u32_e32 v12, vcc, 0, v7, vcc +; TONGA-NEXT: v_subb_u32_e64 v5, vcc, v5, v14, s[0:1] +; TONGA-NEXT: v_lshrrev_b32_e32 v11, 30, v11 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v2, v11 +; TONGA-NEXT: s_add_u32 s0, s4, 16 +; TONGA-NEXT: v_and_b32_e32 v10, -4, v10 ; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc -; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 -; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 -; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14 -; TONGA-NEXT: v_lshrrev_b32_e32 v15, 30, v15 -; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc -; TONGA-NEXT: v_add_u32_e64 v12, s[0:1], v4, v14 -; 
TONGA-NEXT: v_add_u32_e32 v13, vcc, v6, v15 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc -; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 -; TONGA-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v5, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 -; TONGA-NEXT: v_and_b32_e32 v13, -4, v13 -; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v14, vcc -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 -; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc +; TONGA-NEXT: s_addc_u32 s1, s5, 0 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v10 +; TONGA-NEXT: v_and_b32_e32 v14, -4, v11 +; TONGA-NEXT: v_mov_b32_e32 v11, s1 +; TONGA-NEXT: v_mov_b32_e32 v10, s0 +; TONGA-NEXT: v_sub_u32_e64 v2, s[0:1], v2, v14 +; TONGA-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v13, s[0:1] +; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v12, vcc ; TONGA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; TONGA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index c05f341f9e910..2f1057277c472 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -266,20 +266,20 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], v2 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], v0 +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13 -; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 09c0e775f783d..f1c8335463b44 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -723,92 +723,92 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 +; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, -v17, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5] -; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 -; 
GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v16, -v17, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22 +; GFX6-NEXT: v_cndmask_b32_e64 v5, v16, -v5, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v16, -v6, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 +; GFX6-NEXT: v_cndmask_b32_e64 v7, v16, -v7, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v16 ; GFX6-NEXT: s_xor_b64 
s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 +; GFX6-NEXT: v_cndmask_b32_e64 v8, v16, -v8, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 +; GFX6-NEXT: v_cndmask_b32_e64 v9, v16, -v9, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 +; GFX6-NEXT: v_sub_i32_e64 v19, s[8:9], v4, v20 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v20 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[8:9], v19, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v10, v16, -v10, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v11, v27 +; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9] ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 +; GFX6-NEXT: v_sub_i32_e64 v17, s[8:9], v12, v28 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v19, -v20, s[6:7] +; GFX6-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v28 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[8:9], v17, v12 +; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28 -; GFX6-NEXT: 
v_cmp_lt_i32_e32 vcc, 0, v28 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 +; GFX6-NEXT: v_cndmask_b32_e64 v11, v16, -v11, s[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_xor_b64 s[4:5], s[6:7], s[8:9] ; GFX6-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29 +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v13, v29 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 -; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 +; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v16 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5] -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30 +; GFX6-NEXT: v_cndmask_b32_e64 v13, v16, -v13, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v30 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 -; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5] +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 -; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v18 +; GFX6-NEXT: v_sub_i32_e64 v18, s[8:9], v15, v18 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[8:9], v18, v15 ; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5] +; GFX6-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v14, v16, -v17, s[4:5] +; GFX6-NEXT: s_xor_b64 s[4:5], s[6:7], s[8:9] +; GFX6-NEXT: v_cndmask_b32_e64 v15, v18, -v19, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v16i32: @@ -826,92 +826,92 @@ define <16 x i32> 
@v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, -v17, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5] -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v16, -v17, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v16, -v5, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; 
GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v16, -v6, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v16, -v7, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v16, -v8, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 +; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v16, -v9, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 +; GFX8-NEXT: 
v_cmp_lt_i32_e64 s[4:5], v16, v10 +; GFX8-NEXT: v_sub_u32_e64 v19, s[8:9], v4, v20 +; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v20 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[8:9], v19, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v10, v16, -v10, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v11, v27 +; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 +; GFX8-NEXT: v_sub_u32_e64 v17, s[8:9], v12, v28 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, -v20, s[6:7] +; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v28 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[8:9], v17, v12 +; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v16, -v11, s[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_xor_b64 s[4:5], s[6:7], s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29 +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v13, v29 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 -; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 +; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v16 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5] -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v16, -v13, 
s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v14, v30 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 -; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5] +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v18 +; GFX8-NEXT: v_sub_u32_e64 v18, s[8:9], v15, v18 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[8:9], v18, v15 ; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5] +; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v16, -v17, s[4:5] +; GFX8-NEXT: s_xor_b64 s[4:5], s[6:7], s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v15, v18, -v19, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v16i32: diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 9cb22dad86b88..991b2b464a067 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -32,7 +32,6 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 { ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN: ; ScratchSize: 144 %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 16 @@ -71,7 +70,6 @@ define void @needs_align16_stack_align4(i32 %idx) #2 { ; GCN-NEXT: s_mov_b32 s34, s5 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN: ; ScratchSize: 160 
%alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 16 @@ -111,7 +109,6 @@ define void @needs_align32(i32 %idx) #0 { ; GCN-NEXT: s_mov_b32 s34, s5 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN: ; ScratchSize: 192 %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 32 @@ -138,7 +135,6 @@ define void @force_realign4(i32 %idx) #1 { ; GCN-NEXT: s_mov_b32 s34, s5 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN: ; ScratchSize: 52 %alloca.align16 = alloca [8 x i32], align 4, addrspace(5) %gep0 = getelementptr inbounds [8 x i32], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile i32 3, ptr addrspace(5) %gep0, align 4 @@ -295,13 +291,13 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 2 -; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: v_writelane_b32 v40, s34, 3 ; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s34 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; GCN-NEXT: s_add_i32 s32, s32, 0x30000 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 @@ -309,10 +305,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: 
s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 5c113d80a9c80..d9ca2a2bf3f35 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -894,100 +894,100 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 -; GFX6-NEXT: s_mov_b32 s15, s11 +; GFX6-NEXT: s_mov_b32 s15, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3] +; GFX6-NEXT: s_mov_b64 s[12:13], s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v12, 5, v0 ; GFX6-NEXT: v_mov_b32_e32 v13, 0 -; GFX6-NEXT: s_mov_b64 s[6:7], s[14:15] +; GFX6-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[12:15], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:16 +; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], 
s[0:3], 0 addr64 offset:16 ; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[12:13], s[12:15], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s8, s0 -; GFX6-NEXT: s_mov_b32 s9, s1 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: s_waitcnt vmcnt(2) ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 -; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v15, v11, vcc -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v13, v9, vcc -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: v_sub_i32_e64 v10, s[0:1], v14, v10 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX6-NEXT: v_subb_u32_e64 v11, vcc, v15, v11, s[0:1] +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 +; GFX6-NEXT: v_sub_i32_e64 v0, s[0:1], v0, v4 +; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v13, v9, vcc +; GFX6-NEXT: v_subb_u32_e64 v1, s[0:1], v1, v5, s[0:1] +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: v_test_sub_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, s2, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s10, v0 +; 
GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[8:9] -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13] -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v8 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[12:13] ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 16, v12 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-NEXT: v_mov_b32_e32 v17, s1 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX8-NEXT: s_add_u32 s2, s8, 16 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v10, v14 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v11, v15, vcc -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v8, v12 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v9, v13, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], v6, v14 +; GFX8-NEXT: v_mov_b32_e32 v11, s9 +; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v15, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, s8 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; GFX8-NEXT: 
v_sub_u32_e64 v4, s[0:1], v4, v12 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX8-NEXT: v_subb_u32_e64 v5, vcc, v5, v13, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_sub_v4i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] -; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:16 -; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[10:11] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[10:11] +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e64 v10, s[0:1], v10, v14 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v15, s[0:1] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v0 +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[0:1], v8, v12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v10, v14 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v11, v15, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v13, vcc -; GFX9-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v9, v13, 
s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 +; GFX9-NEXT: global_store_dwordx4 v4, v[8:11], s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_sub_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll index 923017400adb1..bfd2a449d5392 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -541,12 +541,10 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-LABEL: v_uaddsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_not_b32_e32 v31, v16 -; GFX6-NEXT: v_min_u32_e32 v0, v0, v31 ; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX6-NEXT: v_not_b32_e32 v32, v16 +; GFX6-NEXT: v_min_u32_e32 v0, v0, v32 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v17 -; GFX6-NEXT: v_min_u32_e32 v1, v1, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v18 ; GFX6-NEXT: v_min_u32_e32 v2, v2, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v19 @@ -566,14 +564,16 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_not_b32_e32 v16, v26 ; GFX6-NEXT: v_min_u32_e32 v10, v10, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v27 +; GFX6-NEXT: v_not_b32_e32 v33, v17 ; GFX6-NEXT: v_min_u32_e32 v11, v11, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v28 -; GFX6-NEXT: v_min_u32_e32 v12, v12, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v29 +; GFX6-NEXT: v_min_u32_e32 v1, v1, v33 ; GFX6-NEXT: v_min_u32_e32 v13, v13, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v30 -; GFX6-NEXT: v_min_u32_e32 v14, v14, v16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v17 +; GFX6-NEXT: v_not_b32_e32 v17, v28 +; GFX6-NEXT: v_min_u32_e32 v14, v14, v16 +; GFX6-NEXT: v_min_u32_e32 v12, v12, v17 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v18 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v19 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v20 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 
063c56faf9ce4..210b354ba9042 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -628,18 +628,18 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_u32_e32 v8, v0 ; SI-NEXT: v_cvt_f32_u32_e32 v10, v1 @@ -695,27 +695,27 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; SI-NEXT: v_sub_i32_e32 v12, vcc, v4, v0 +; SI-NEXT: v_sub_i32_e32 v14, vcc, v5, v1 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; SI-NEXT: v_sub_i32_e32 v13, vcc, v5, v1 +; SI-NEXT: v_sub_i32_e32 v13, vcc, v6, v2 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] -; SI-NEXT: v_sub_i32_e32 v14, vcc, v6, v2 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] ; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; SI-NEXT: v_cndmask_b32_e64 v5, v5, 
v13, s[2:3] -; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v9 -; SI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; SI-NEXT: v_add_i32_e32 v14, vcc, 1, v10 +; SI-NEXT: v_cndmask_b32_e64 v5, v5, v14, s[2:3] +; SI-NEXT: v_add_i32_e32 v14, vcc, 1, v9 +; SI-NEXT: v_cndmask_b32_e64 v6, v6, v13, s[4:5] +; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v10 ; SI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7] ; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v11 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v9, v14, vcc ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v10, v13, vcc ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 @@ -723,18 +723,18 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_u32_e32 v8, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v10, v1 @@ -790,27 +790,27 @@ define amdgpu_kernel void 
@udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; VI-NEXT: v_sub_u32_e32 v12, vcc, v4, v0 +; VI-NEXT: v_sub_u32_e32 v14, vcc, v5, v1 ; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; VI-NEXT: v_sub_u32_e32 v13, vcc, v5, v1 +; VI-NEXT: v_sub_u32_e32 v13, vcc, v6, v2 ; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] -; VI-NEXT: v_sub_u32_e32 v14, vcc, v6, v2 ; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] ; VI-NEXT: v_sub_u32_e32 v15, vcc, v7, v3 ; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8 -; VI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] -; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v9 -; VI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; VI-NEXT: v_add_u32_e32 v14, vcc, 1, v10 +; VI-NEXT: v_cndmask_b32_e64 v5, v5, v14, s[2:3] +; VI-NEXT: v_add_u32_e32 v14, vcc, 1, v9 +; VI-NEXT: v_cndmask_b32_e64 v6, v6, v13, s[4:5] +; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v10 ; VI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7] ; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v11 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v9, v14, vcc ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v10, v13, vcc ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 @@ -818,99 +818,99 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_mov_b32 
flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s2, 16 -; GCN-NEXT: s_addc_u32 s5, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s0, s10, 16 +; GCN-NEXT: s_addc_u32 s1, s11, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: flat_load_dwordx4 v[6:9], v[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: flat_load_dwordx4 v[5:8], v[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v14, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v16, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 -; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 -; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GCN-NEXT: v_sub_u32_e32 v11, vcc, 0, v0 -; GCN-NEXT: v_sub_u32_e32 v13, vcc, 0, v1 -; GCN-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 -; GCN-NEXT: v_sub_u32_e32 v17, vcc, 0, v3 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 -; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 -; GCN-NEXT: v_mul_lo_u32 v17, v17, v16 -; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 -; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 -; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 -; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 -; GCN-NEXT: v_add_u32_e32 v10, vcc, v10, v11 -; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13 -; 
GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 -; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17 +; GCN-NEXT: v_cvt_f32_u32_e32 v9, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v11, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v13, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v15, v3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13 +; GCN-NEXT: v_rcp_iflag_f32_e32 v15, v15 +; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GCN-NEXT: v_mul_f32_e32 v11, 0x4f7ffffe, v11 +; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 +; GCN-NEXT: v_mul_f32_e32 v15, 0x4f7ffffe, v15 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_u32_f32_e32 v15, v15 +; GCN-NEXT: v_sub_u32_e32 v10, vcc, 0, v0 +; GCN-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 +; GCN-NEXT: v_sub_u32_e32 v14, vcc, 0, v2 +; GCN-NEXT: v_sub_u32_e32 v16, vcc, 0, v3 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v9 +; GCN-NEXT: v_mul_lo_u32 v12, v12, v11 +; GCN-NEXT: v_mul_lo_u32 v14, v14, v13 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v15 +; GCN-NEXT: v_mul_hi_u32 v10, v9, v10 +; GCN-NEXT: v_mul_hi_u32 v12, v11, v12 +; GCN-NEXT: v_mul_hi_u32 v14, v13, v14 +; GCN-NEXT: v_mul_hi_u32 v16, v15, v16 +; GCN-NEXT: v_add_u32_e32 v9, vcc, v9, v10 +; GCN-NEXT: v_add_u32_e32 v10, vcc, v11, v12 +; GCN-NEXT: v_add_u32_e32 v11, vcc, v13, v14 +; GCN-NEXT: v_add_u32_e32 v12, vcc, v15, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 ; GCN-NEXT: v_mul_hi_u32 v11, v7, v11 ; GCN-NEXT: v_mul_hi_u32 v12, v8, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v9, v13 -; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 -; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 -; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 -; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 -; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14 -; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 -; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18 -; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v19 -; GCN-NEXT: 
v_add_u32_e32 v15, vcc, 1, v10 -; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 -; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 -; GCN-NEXT: v_sub_u32_e32 v18, vcc, v6, v0 -; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] -; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1 -; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_sub_u32_e32 v17, vcc, v8, v2 -; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; GCN-NEXT: v_sub_u32_e32 v14, vcc, v9, v3 -; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v18, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v13, v9, v0 +; GCN-NEXT: v_mul_lo_u32 v15, v10, v1 +; GCN-NEXT: v_mul_lo_u32 v17, v11, v2 +; GCN-NEXT: v_mul_lo_u32 v19, v12, v3 +; GCN-NEXT: v_sub_u32_e32 v13, vcc, v5, v13 +; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v15 +; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v17 +; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v19 +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v9 ; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3] -; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[4:5] +; GCN-NEXT: v_add_u32_e32 v18, vcc, 1, v11 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v13, v0 +; GCN-NEXT: v_add_u32_e32 v20, vcc, 1, v12 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v6, v1 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 +; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v13, v0 +; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v16, s[2:3] +; GCN-NEXT: v_sub_u32_e32 v15, vcc, v6, v1 +; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v18, s[4:5] +; GCN-NEXT: v_sub_u32_e32 v16, vcc, v7, v2 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, v8, v3 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v20, 
s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v9 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v15, s[2:3] +; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v16, s[4:5] +; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[6:7] ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v13, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v14, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v11, v16, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_cndmask_b32_e32 v3, v12, v17, vcc ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll index 334215125f58a..e75ae0f721707 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll @@ -2927,9 +2927,9 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v26 +; GFX7-SDAG-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v27, vcc ; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v2, v18 -; GFX7-SDAG-NEXT: buffer_load_dword v18, off, 
s[0:3], s32 ; GFX7-SDAG-NEXT: v_addc_u32_e32 v3, vcc, v3, v19, vcc ; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v6, v22 ; GFX7-SDAG-NEXT: v_addc_u32_e32 v7, vcc, v7, v23, vcc @@ -2949,7 +2949,7 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, v14, v30 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_addc_u32_e32 v9, vcc, v15, v18, vcc +; GFX7-SDAG-NEXT: v_addc_u32_e32 v9, vcc, v15, v26, vcc ; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX7-SDAG-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -3001,9 +3001,9 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: v_add_u32_e32 v10, vcc, v10, v26 +; GFX8-SDAG-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v27, vcc ; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v2, v18 -; GFX8-SDAG-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_addc_u32_e32 v3, vcc, v3, v19, vcc ; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v6, v22 ; GFX8-SDAG-NEXT: v_addc_u32_e32 v7, vcc, v7, v23, vcc @@ -3023,7 +3023,7 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GFX8-SDAG-NEXT: v_add_u32_e32 v8, vcc, v14, v30 ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX8-SDAG-NEXT: v_addc_u32_e32 v9, vcc, v15, v18, vcc +; GFX8-SDAG-NEXT: v_addc_u32_e32 v9, vcc, v15, v26, vcc ; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-SDAG-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v2, v6 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 98919f565d902..c25c8b9627486 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ 
b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -3059,32 +3059,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[17:18] +; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v12, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v13, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v12, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v15, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v3, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v2, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v4, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v8, v5, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v9, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3130,32 +3127,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; 
GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v12, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v13, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v12, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v15, v[5:6] +; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v3, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v2, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v4, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v8, v5, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v9, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3459,80 +3453,80 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_mul_v16i64: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v19, v2, v19 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v32, v3, v18 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v33, v3, v18 ; GFX7-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v18, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v27, v10, v27 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v33, v11, v26 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v26, 0 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v10, v26, 0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v19 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v15, v15, v30 -; GFX7-SDAG-NEXT: v_add_i32_e32 v32, vcc, v3, v32 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v14, v30, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v30, v4, v21 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v20, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v35, vcc, v3, v33 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v10, v27 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v10, v11, v26 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v21, v4, v21 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v5, v20 -; 
GFX7-SDAG-NEXT: v_mul_lo_u32 v29, v12, v29 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v34, v13, v28 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v12, v28, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v27 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v33 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v30 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v10, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v2, v11 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v13, v29 -; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, v4, v5 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v23, v6, v23 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v26, v7, v22 +; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v32, v3 +; GFX7-SDAG-NEXT: v_add_i32_e32 v26, vcc, v3, v10 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v6, v23 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v23, v7, v22 ; GFX7-SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v22, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v17, v0, v17 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v10, v32, v10 -; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v7, v23 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v18, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v7, v26 -; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v21, v2 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v7, v18 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v34 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v22, v12, v29 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v14, v30, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v3, v23 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v20, 0 +; GFX7-SDAG-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v15, v15, v30 +; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v21 +; GFX7-SDAG-NEXT: v_add_i32_e32 v21, vcc, v4, v5 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v28, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v12, v0, v17 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v33, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, v5, v22 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v9, v9, v24 +; 
GFX7-SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v31, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v13, v13, v28 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v2, v26 +; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v19, v2 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_mul_lo_u32 v4, v14, v31 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v14, v1, v16 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v19, v4 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v6, v6, v4 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v16, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v15, v8, v25 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v14, v20 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v20, v7, v33 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v1, v16 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v14, v35, v31 +; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v34, v5 +; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v15, v6, v5 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v16, 0 ; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v24, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v8, v9, v24 -; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v15 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v8, v8, v25 +; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GFX7-SDAG-NEXT: v_add_i32_e32 v12, vcc, v6, v7 ; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, v2, v10 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v9, v4, v1 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v0, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v23, v6 -; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v6, v7 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v2, v9 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v3, v11 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v12, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v5, v0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v13, v12 -; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; 
GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v2, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v8, v22 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v1, v3 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v20, v4 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v20, v22, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v4, v0, v1 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v3, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v0, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v5, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, v11, v15 +; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, v2, v14 +; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v8, v20 +; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v8, v3, v13 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v4, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v21, v4 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v12, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v4, v6, v2 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v1, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v3, v4 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v18, v5 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v10, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v0, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v9, v10 +; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v4, v5 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v4, v2, v0 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v6, v7 ; GFX7-SDAG-NEXT: v_mul_lo_u32 
v2, v2, v3 ; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -3542,142 +3536,143 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v16, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v34, v31, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v25, v35 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v34, v8, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v10, v26, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[32:33] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v35 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: 
v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v16, v34, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v26, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v28, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v0, v[9:10] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v17 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v18, v[25:26] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v16, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v34, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v29, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v11, v17 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v28, v[2:3] +; GFX7-GISEL-NEXT: buffer_load_dword v13, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[11:12] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 +; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v22, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v0, 0 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v14, v13, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v2, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, 
v13, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v23, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v24, v32, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v22, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v31, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v0, v[9:10] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v12 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v9, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v24, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v4, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v32, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v4, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v6, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_mul_lo_u32 v19, v2, v19 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v32, v3, v18 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v33, v3, v18 ; GFX8-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v18, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v27, v10, v27 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v33, v11, v26 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v26, 0 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v10, v26, 0 ; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v3, v19 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v15, v15, v30 -; GFX8-SDAG-NEXT: v_add_u32_e32 v32, vcc, v3, v32 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v14, v30, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v30, v4, v21 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v20, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v35, vcc, v3, v33 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v3, v10, v27 +; GFX8-SDAG-NEXT: 
v_mul_lo_u32 v10, v11, v26 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v21, v4, v21 ; GFX8-SDAG-NEXT: v_mul_lo_u32 v5, v5, v20 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v29, v12, v29 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v34, v13, v28 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v12, v28, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v11, vcc, v11, v27 -; GFX8-SDAG-NEXT: v_add_u32_e32 v11, vcc, v11, v33 -; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v4, v30 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v10, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v2, v2, v11 -; GFX8-SDAG-NEXT: v_add_u32_e32 v11, vcc, v13, v29 -; GFX8-SDAG-NEXT: v_add_u32_e32 v13, vcc, v4, v5 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v23, v6, v23 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v26, v7, v22 +; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v32, v3 +; GFX8-SDAG-NEXT: v_add_u32_e32 v26, vcc, v3, v10 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v3, v6, v23 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v23, v7, v22 ; GFX8-SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v22, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v17, v0, v17 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v10, v32, v10 -; GFX8-SDAG-NEXT: v_add_u32_e32 v7, vcc, v7, v23 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v18, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v7, vcc, v7, v26 -; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v21, v2 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v7, v18 -; GFX8-SDAG-NEXT: v_add_u32_e32 v11, vcc, v11, v34 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v22, v12, v29 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v14, v30, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; GFX8-SDAG-NEXT: v_add_u32_e32 v7, vcc, v3, v23 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v20, 0 +; GFX8-SDAG-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v15, v15, v30 +; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v4, v21 +; GFX8-SDAG-NEXT: v_add_u32_e32 v21, vcc, v4, v5 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v28, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v12, v0, v17 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], 
v6, v33, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v17, vcc, v5, v22 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v9, v9, v24 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v31, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v13, v13, v28 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v2, v2, v26 +; GFX8-SDAG-NEXT: v_add_u32_e32 v13, vcc, v17, v13 +; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v19, v2 ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX8-SDAG-NEXT: v_mul_lo_u32 v4, v14, v31 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v14, v1, v16 -; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v19, v4 -; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v4, v15 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v6, v6, v4 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v16, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v15, v8, v25 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v5, v14, v20 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v20, v7, v33 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v1, v16 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v14, v35, v31 +; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v34, v5 +; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v5, v15 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v15, v6, v5 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v16, 0 ; GFX8-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v24, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v8, v9, v24 -; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v5, v17 -; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v15 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v8, v8, v25 +; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v6, v12 +; GFX8-SDAG-NEXT: v_add_u32_e32 v12, vcc, v6, v7 ; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v8 -; GFX8-SDAG-NEXT: v_add_u32_e32 v8, vcc, v2, v10 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v9, v4, v1 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v0, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v23, v6 -; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v5, v14 -; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v6, v7 -; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v2, v9 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v3, v11 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v12, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 
v5, v5, v0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v0, v13, v12 -; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v2, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v5, v8, v22 -; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v1, v3 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v1, v20, v4 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v20, v22, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v4, v1 -; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v4, v0, v1 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v3, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v9 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v0, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v1, v5, v1 +; GFX8-SDAG-NEXT: v_add_u32_e32 v8, vcc, v11, v15 +; GFX8-SDAG-NEXT: v_add_u32_e32 v9, vcc, v2, v14 +; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v8, v20 +; GFX8-SDAG-NEXT: v_add_u32_e32 v7, vcc, v7, v1 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v8, v3, v13 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v4, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v3, v21, v4 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v0, v12, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v4, v6, v2 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v1, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v7, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v3, v4 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v5, v18, v5 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v10, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v0, v1 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v0, v9, v10 +; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v4, v5 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v4, v2, v0 +; GFX8-SDAG-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v2, v3, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v6, v7 ; GFX8-SDAG-NEXT: v_mul_lo_u32 v2, v2, v3 ; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v2 @@ -3687,63 +3682,64 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v16, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v34, v31, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v25, v35 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v34, v8, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v10, v26, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[32:33] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v35 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v16, v34, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v26, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v28, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v0, v[9:10] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v17 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v18, v[25:26] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v16, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v34, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v29, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v11, v17 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v28, v[2:3] +; GFX8-GISEL-NEXT: buffer_load_dword v13, 
off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[11:12] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v22, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v0, 0 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v14, v13, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v2, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 
v[2:3], s[4:5], v3, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v13, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v23, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v24, v32, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v22, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v31, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v0, v[9:10] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v12 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v9, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v24, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v4, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v32, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v4, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v6, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3952,64 +3948,62 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v0, v16, 0 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v2, v18, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v6, v22, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v0, v17, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v4, v20, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v34 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[35:36] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v19, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v37 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v39 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v8, v24, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v4, v21, v[1:2] +; GFX10-GISEL-NEXT: 
v_mad_u64_u32 v[35:36], s4, v4, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v6, v22, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v8, v24, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[50:51], s4, v0, v17, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[64:65], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v38 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v2, v19, v[34:35] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[50:51] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v49 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[49:50], s4, v10, v26, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v23, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v35 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v20, v[48:49] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v8, v25, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v50 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v10, v27, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[53:54], s4, v12, v28, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v9, v24, v[51:52] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v54 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v11, v26, v[52:53] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v36, v53, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[54:55], s4, v12, v29, v[2:3] -; GFX10-GISEL-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v18, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v33, v49, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v22, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v31, v34, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v4, v21, v[36:37] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v12, v28, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[54:55], s4, v6, v23, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v18, v[38:39] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v50 +; 
GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v31, v48, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v5, v20, v[51:52] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v7, v22, v[54:55] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v8, v25, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v10, v27, v[0:1] +; GFX10-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v65 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v53 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v33, v49, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[50:51], s4, v37, v64, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v12, v29, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v35, v52, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v51 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v14, v12, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v38, v3, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v13, v28, v[54:55] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v15, v30, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v18 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v31, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v33, v10, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v14, v1, v[6:7] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v18 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v9, v24, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v11, v26, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v15, v30, v[38:39] ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v38, v14, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v17, v11, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v36, v13, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v7, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v3, v[14:15] -; GFX10-GISEL-NEXT: v_mov_b32_e32 
v1, v21 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v49, v[9:10] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v53, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v17, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v20, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v4, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v16, v34, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v11, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v13, v19, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v13, v28, v[21:22] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v31, v6, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v33, v7, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v37, v8, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v19, v50, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v23 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v35, v9, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v17, v22, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v64, v[6:7] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v2, v49, v[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v52, v[11:12] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, v13 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v19, v4, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v16, v48, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v17, v2, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v5, v50, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v9, v22, v[2:3] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v20, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v7, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git 
a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index b5d9d00c48045..2adbd727235b5 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3923,30 +3923,30 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[16:17] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[4:5], v[20:21] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[2:3], v[18:19] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; 
GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[8:9] ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] @@ -3975,29 +3975,23 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[18:19] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[8:9], v[24:25] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-GISEL-NEXT: 
v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[8:9] ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[6:7], v[4:5], v[12:13] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -4005,8 +3999,14 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[8:9], v[10:11], v[26:27] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, 
vcc @@ -4027,30 +4027,30 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[16:17] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[4:5], v[20:21] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[2:3], v[18:19] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[8:9] ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; 
GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] @@ -4079,29 +4079,23 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[18:19] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[8:9], v[24:25] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-GISEL-NEXT: 
v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[8:9] ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[6:7], v[4:5], v[12:13] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -4109,8 +4103,14 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[8:9], v[10:11], v[26:27] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index 2a989ecd2ebad..2c4e0060d41e1 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ 
b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3923,30 +3923,30 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[16:17] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[4:5], v[20:21] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[2:3], v[18:19] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[8:9] ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: 
v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] @@ -3975,29 +3975,23 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[18:19] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[24:25] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, 
v7, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[8:9] ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[6:7], v[4:5], v[12:13] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -4005,8 +3999,14 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[8:9], v[10:11], v[26:27] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc @@ -4027,30 +4027,30 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 
; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[16:17] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[4:5], v[20:21] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[2:3], v[18:19] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[8:9] ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, 
s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] @@ -4079,29 +4079,23 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[18:19] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[24:25] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-GISEL-NEXT: 
v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[8:9] ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[6:7], v[4:5], v[12:13] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -4109,8 +4103,14 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[8:9], v[10:11], v[26:27] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 69fd58aadfbcc..1dc388ea3c8fe 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3799,30 +3799,30 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], 
s32 ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[16:17] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[4:5], v[20:21] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[2:3], v[18:19] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[8:9] ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, 
v5, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] @@ -3851,29 +3851,23 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[18:19] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[24:25] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; 
GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[6:7], v[4:5], v[12:13] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -3881,8 +3875,14 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[8:9], v[10:11], v[26:27] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc @@ -3903,30 +3903,30 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[16:17] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; 
GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[4:5], v[20:21] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[2:3], v[18:19] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[8:9] ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; 
GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] @@ -3955,29 +3955,23 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[18:19] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[24:25] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] ; 
GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[6:7], v[4:5], v[12:13] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -3985,8 +3979,14 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[8:9], v[10:11], v[26:27] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 1d3b42ee43b0f..188f0fbb7d3fe 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3535,30 +3535,30 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[16:17] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 
v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[4:5], v[20:21] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[2:3], v[18:19] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[8:9] ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, 
v13, v5, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] @@ -3587,29 +3587,23 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[18:19] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[24:25] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, 
s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[6:7], v[4:5], v[12:13] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -3617,8 +3611,14 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[8:9], v[10:11], v[26:27] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc @@ -3639,30 +3639,30 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[16:17] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; 
GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[4:5], v[20:21] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[2:3], v[18:19] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[8:9] ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] @@ -3691,29 +3691,23 @@ define i64 
@test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[18:19] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[24:25] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 
v22, v6, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[6:7], v[4:5], v[12:13] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -3721,8 +3715,14 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[8:9], v[10:11], v[26:27] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc